diff --git a/.github/workflows/bazel-checks.yml b/.github/workflows/bazel-checks.yml index 7c3db4ed7865f..1b27dbc1dbc4d 100644 --- a/.github/workflows/bazel-checks.yml +++ b/.github/workflows/bazel-checks.yml @@ -52,4 +52,6 @@ jobs: working-directory: utils/bazel run: | bazelisk test --config=ci --sandbox_base="" \ + --remote_cache=https://storage.googleapis.com/$CACHE_GCS_BUCKET-bazel \ + --google_default_credentials \ @llvm-project//llvm/unittests:adt_tests diff --git a/.github/workflows/issue-write.yml b/.github/workflows/issue-write.yml index 8a083f9143ec6..4f8fd7a48aff6 100644 --- a/.github/workflows/issue-write.yml +++ b/.github/workflows/issue-write.yml @@ -7,6 +7,7 @@ on: - "Check for private emails used in PRs" - "PR Request Release Note" - "Code lint" + - "CI Checks" types: - completed diff --git a/.github/workflows/libclang-abi-tests.yml b/.github/workflows/libclang-abi-tests.yml index 6377dd53d1f6c..b92b61de05088 100644 --- a/.github/workflows/libclang-abi-tests.yml +++ b/.github/workflows/libclang-abi-tests.yml @@ -84,6 +84,8 @@ jobs: if: github.repository_owner == 'llvm' needs: abi-dump-setup runs-on: ubuntu-24.04 + container: + image: "ghcr.io/llvm/ci-ubuntu-24.04-abi-tests@sha256:f80125c0f767e29b8616210c0fd5cea2cd1f4fb6f2ca86d89f6016b6329b8d7f" #ghcr.io/llvm/ci-ubuntu-24.04-abi-tests:9524b37c503f strategy: matrix: name: @@ -101,17 +103,6 @@ jobs: steps: - name: Install Ninja uses: llvm/actions/install-ninja@42d80571b13f4599bbefbc7189728b64723c7f78 # main - - name: Install abi-compliance-checker - run: | - sudo apt-get update - sudo apt-get install -y abi-dumper autoconf pkg-config - - name: Install universal-ctags - run: | - git clone https://github.com/universal-ctags/ctags.git - cd ctags - ./autogen.sh - ./configure - sudo make install - name: Download source code uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: @@ -139,6 +130,8 @@ jobs: abi-compare: if: github.repository_owner == 'llvm' runs-on: ubuntu-24.04 + container: + image: "ghcr.io/llvm/ci-ubuntu-24.04-abi-tests@sha256:f80125c0f767e29b8616210c0fd5cea2cd1f4fb6f2ca86d89f6016b6329b8d7f" #ghcr.io/llvm/ci-ubuntu-24.04-abi-tests:9524b37c503f needs: - abi-dump-setup - abi-dump @@ -154,10 +147,6 @@ jobs: name: build-latest path: build-latest - - name: Install abi-compliance-checker - run: | - sudo apt-get update - sudo apt-get install -y abi-compliance-checker - name: Compare ABI run: | for lib in ${{ needs.abi-dump-setup.outputs.ABI_LIBS }}; do diff --git a/bolt/include/bolt/Profile/ProfileYAMLMapping.h b/bolt/include/bolt/Profile/ProfileYAMLMapping.h index 41e2bd1651efd..b393c85321b7d 100644 --- a/bolt/include/bolt/Profile/ProfileYAMLMapping.h +++ b/bolt/include/bolt/Profile/ProfileYAMLMapping.h @@ -29,6 +29,10 @@ struct CallSiteInfo { uint32_t EntryDiscriminator{0}; /// multiple entry discriminator uint64_t Count{0}; uint64_t Mispreds{0}; + // Pseudo probe information, optional + uint32_t Probe{0}; + bool Indirect = false; + uint32_t InlineTreeNode{0}; bool operator==(const CallSiteInfo &Other) const { return Offset == Other.Offset && DestId == Other.DestId && @@ -63,6 +67,9 @@ template <> struct MappingTraits { YamlIO.mapOptional("disc", CSI.EntryDiscriminator, (uint32_t)0); YamlIO.mapRequired("cnt", CSI.Count); YamlIO.mapOptional("mis", CSI.Mispreds, (uint64_t)0); + YamlIO.mapOptional("pp", CSI.Probe, 0); + YamlIO.mapOptional("ppn", CSI.InlineTreeNode, 0); + YamlIO.mapOptional("ind", CSI.Indirect, false); } static const bool flow = true; @@ -95,29 +102,20 @@ template <> struct 
MappingTraits { namespace bolt { struct PseudoProbeInfo { - uint32_t InlineTreeIndex = 0; - uint64_t BlockMask = 0; // bitset with probe indices from 1 to 64 - std::vector<uint64_t> BlockProbes; // block probes with indices above 64 - std::vector<uint64_t> CallProbes; - std::vector<uint64_t> IndCallProbes; + std::vector<uint64_t> BlockProbes; std::vector<uint32_t> InlineTreeNodes; bool operator==(const PseudoProbeInfo &Other) const { - return InlineTreeIndex == Other.InlineTreeIndex && - BlockProbes == Other.BlockProbes && CallProbes == Other.CallProbes && - IndCallProbes == Other.IndCallProbes; + return InlineTreeNodes == Other.InlineTreeNodes && + BlockProbes == Other.BlockProbes; } }; } // end namespace bolt template <> struct MappingTraits<bolt::PseudoProbeInfo> { static void mapping(IO &YamlIO, bolt::PseudoProbeInfo &PI) { - YamlIO.mapOptional("blx", PI.BlockMask, 0); - YamlIO.mapOptional("blk", PI.BlockProbes, std::vector<uint64_t>()); - YamlIO.mapOptional("call", PI.CallProbes, std::vector<uint64_t>()); - YamlIO.mapOptional("icall", PI.IndCallProbes, std::vector<uint64_t>()); - YamlIO.mapOptional("id", PI.InlineTreeIndex, 0); - YamlIO.mapOptional("ids", PI.InlineTreeNodes, std::vector<uint32_t>()); + YamlIO.mapOptional("blk", PI.BlockProbes, std::vector<uint64_t>(1, 1)); + YamlIO.mapOptional("ids", PI.InlineTreeNodes, std::vector<uint32_t>(1, 0)); } static const bool flow = true; diff --git a/bolt/include/bolt/Profile/YAMLProfileWriter.h b/bolt/include/bolt/Profile/YAMLProfileWriter.h index d4d7217464cc8..50ee78d342df8 100644 --- a/bolt/include/bolt/Profile/YAMLProfileWriter.h +++ b/bolt/include/bolt/Profile/YAMLProfileWriter.h @@ -74,25 +74,24 @@ class YAMLProfileWriter { collectInlineTree(const MCPseudoProbeDecoder &Decoder, const MCDecodedPseudoProbeInlineTree &Root); - // 0 - block probe, 1 - indirect call, 2 - direct call - using ProbeList = std::array<SmallVector<uint64_t, 0>, 3>; - using NodeIdToProbes = DenseMap<uint32_t, ProbeList>; - static std::vector<yaml::bolt::PseudoProbeInfo> - convertNodeProbes(NodeIdToProbes &NodeProbes); - public: - template <typename T> - static std::vector<yaml::bolt::PseudoProbeInfo> - writeBlockProbes(T Probes, const InlineTreeMapTy &InlineTreeNodeId) { - NodeIdToProbes NodeProbes; - for (const MCDecodedPseudoProbe &Probe : Probes) { - auto It = InlineTreeNodeId.find(Probe.getInlineTreeNode()); - if (It == InlineTreeNodeId.end()) - continue; - NodeProbes[It->second][Probe.getType()].emplace_back(Probe.getIndex()); - } - return convertNodeProbes(NodeProbes); - } + class BlockProbeCtx { + struct Call { + uint64_t Id; + uint32_t Node; + bool Indirect; + bool Used; + }; + // Group block probes by node id. 
+ DenseMap<uint32_t, std::vector<uint64_t>> NodeToProbes; + // Offset -> call probe + DenseMap<uint32_t, Call> CallProbes; + + public: + void addBlockProbe(const InlineTreeMapTy &Map, + const MCDecodedPseudoProbe &Probe, uint32_t ProbeOffset); + void finalize(yaml::bolt::BinaryBasicBlockProfile &YamlBB); + }; }; } // namespace bolt } // namespace llvm diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index cafe4bfebf19d..6b969011df589 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -2397,10 +2397,7 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC, PseudoProbeDecoder->getAddress2ProbesMap(); BinaryFunction::FragmentsSetTy Fragments(BF->Fragments); Fragments.insert(BF); - DenseMap< - uint32_t, - std::vector<std::reference_wrapper<const MCDecodedPseudoProbe>>> - BlockProbes; + DenseMap<uint32_t, YAMLProfileWriter::BlockProbeCtx> BlockCtx; for (const BinaryFunction *F : Fragments) { const uint64_t FuncAddr = F->getAddress(); for (const MCDecodedPseudoProbe &Probe : @@ -2408,15 +2405,14 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC, const uint32_t OutputAddress = Probe.getAddress(); const uint32_t InputOffset = BAT->translate( FuncAddr, OutputAddress - FuncAddr, /*IsBranchSrc=*/true); - const unsigned BlockIndex = getBlock(InputOffset).second; - BlockProbes[BlockIndex].emplace_back(Probe); + const auto &[BlockOffset, BlockIndex] = getBlock(InputOffset); + BlockCtx[BlockIndex].addBlockProbe(InlineTreeNodeId, Probe, + InputOffset - BlockOffset); } } - for (auto &[Block, Probes] : BlockProbes) { - YamlBF.Blocks[Block].PseudoProbes = - YAMLProfileWriter::writeBlockProbes(Probes, InlineTreeNodeId); - } + for (auto &[Block, Ctx] : BlockCtx) + Ctx.finalize(YamlBF.Blocks[Block]); } // Skip printing if there's no profile data llvm::erase_if( diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index 1a61949d77472..5fb65153cf313 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -348,26 +348,10 @@ class StaleMatcher { return It->second; }; - auto matchPseudoProbeInfo = [&](const yaml::bolt::PseudoProbeInfo - &ProfileProbe, - uint32_t NodeId) { - for (uint64_t Index = 0; Index < 64; ++Index) - if (ProfileProbe.BlockMask & 1ull << Index) - ++FlowBlockMatchCount[matchProfileProbeToBlock(NodeId, Index + 1)]; - for (const auto &ProfileProbes : - {ProfileProbe.BlockProbes, ProfileProbe.IndCallProbes, - ProfileProbe.CallProbes}) - for (uint64_t ProfileProbe : ProfileProbes) - ++FlowBlockMatchCount[matchProfileProbeToBlock(NodeId, ProfileProbe)]; - }; - - for (const yaml::bolt::PseudoProbeInfo &ProfileProbe : BlockPseudoProbes) { - if (!ProfileProbe.InlineTreeNodes.empty()) - for (uint32_t ProfileInlineTreeNode : ProfileProbe.InlineTreeNodes) - matchPseudoProbeInfo(ProfileProbe, ProfileInlineTreeNode); - else - matchPseudoProbeInfo(ProfileProbe, ProfileProbe.InlineTreeIndex); - } + for (const yaml::bolt::PseudoProbeInfo &ProfileProbe : BlockPseudoProbes) + for (uint32_t Node : ProfileProbe.InlineTreeNodes) + for (uint64_t Probe : ProfileProbe.BlockProbes) + ++FlowBlockMatchCount[matchProfileProbeToBlock(Node, Probe)]; uint32_t BestMatchCount = 0; uint32_t TotalMatchCount = 0; const FlowBlock *BestMatchBlock = nullptr; diff --git a/bolt/lib/Profile/YAMLProfileWriter.cpp b/bolt/lib/Profile/YAMLProfileWriter.cpp index 5c631f93f01da..cd4e77b0dbb60 100644 --- a/bolt/lib/Profile/YAMLProfileWriter.cpp +++ b/bolt/lib/Profile/YAMLProfileWriter.cpp @@ -129,50 +129,62 @@ YAMLProfileWriter::convertPseudoProbeDesc(const MCPseudoProbeDecoder 
&Decoder) { return {Desc, InlineTree}; } -std::vector<yaml::bolt::PseudoProbeInfo> -YAMLProfileWriter::convertNodeProbes(NodeIdToProbes &NodeProbes) { - struct BlockProbeInfoHasher { - size_t operator()(const yaml::bolt::PseudoProbeInfo &BPI) const { - return llvm::hash_combine(llvm::hash_combine_range(BPI.BlockProbes), - llvm::hash_combine_range(BPI.CallProbes), - llvm::hash_combine_range(BPI.IndCallProbes)); +void YAMLProfileWriter::BlockProbeCtx::addBlockProbe( + const InlineTreeMapTy &Map, const MCDecodedPseudoProbe &Probe, + uint32_t ProbeOffset) { + auto It = Map.find(Probe.getInlineTreeNode()); + if (It == Map.end()) + return; + auto NodeId = It->second; + uint32_t Index = Probe.getIndex(); + if (Probe.isCall()) + CallProbes[ProbeOffset] = + Call{Index, NodeId, Probe.isIndirectCall(), false}; + else + NodeToProbes[NodeId].emplace_back(Index); +} + +void YAMLProfileWriter::BlockProbeCtx::finalize( + yaml::bolt::BinaryBasicBlockProfile &YamlBB) { + // Hash block probes by vector + struct ProbeHasher { + size_t operator()(const ArrayRef<uint64_t> Probes) const { + return llvm::hash_combine_range(Probes); } }; - // Check identical BlockProbeInfo structs and merge them - std::unordered_map<yaml::bolt::PseudoProbeInfo, std::vector<uint32_t>, - BlockProbeInfoHasher> - BPIToNodes; - for (auto &[NodeId, Probes] : NodeProbes) { - yaml::bolt::PseudoProbeInfo BPI; - BPI.BlockProbes = std::vector<uint64_t>(Probes[0].begin(), Probes[0].end()); - BPI.IndCallProbes = std::vector<uint64_t>(Probes[1].begin(), Probes[1].end()); - BPI.CallProbes = std::vector<uint64_t>(Probes[2].begin(), Probes[2].end()); - BPIToNodes[BPI].push_back(NodeId); + // Check identical block probes and merge them + std::unordered_map<std::vector<uint64_t>, std::vector<uint32_t>, ProbeHasher> + ProbesToNodes; + for (auto &[NodeId, Probes] : NodeToProbes) { + llvm::sort(Probes); + ProbesToNodes[Probes].emplace_back(NodeId); } - - auto handleMask = [](const auto &Ids, auto &Vec, auto &Mask) { - for (auto Id : Ids) - if (Id > 64) - Vec.emplace_back(Id); - else - Mask |= 1ull << (Id - 1); - }; - - // Add to YAML with merged nodes/block mask optimizations - std::vector<yaml::bolt::PseudoProbeInfo> YamlProbes; - YamlProbes.reserve(BPIToNodes.size()); - for (const auto &[BPI, Nodes] : BPIToNodes) { - auto &YamlBPI = YamlProbes.emplace_back(yaml::bolt::PseudoProbeInfo()); - YamlBPI.CallProbes = BPI.CallProbes; - YamlBPI.IndCallProbes = BPI.IndCallProbes; - if (Nodes.size() == 1) - YamlBPI.InlineTreeIndex = Nodes.front(); - else - YamlBPI.InlineTreeNodes = Nodes; - handleMask(BPI.BlockProbes, YamlBPI.BlockProbes, YamlBPI.BlockMask); + for (auto &[Probes, Nodes] : ProbesToNodes) { + llvm::sort(Nodes); + YamlBB.PseudoProbes.emplace_back( + yaml::bolt::PseudoProbeInfo{Probes, Nodes}); + } + for (yaml::bolt::CallSiteInfo &CSI : YamlBB.CallSites) { + auto It = CallProbes.find(CSI.Offset); + if (It == CallProbes.end()) + continue; + Call &Probe = It->second; + CSI.Probe = Probe.Id; + CSI.InlineTreeNode = Probe.Node; + CSI.Indirect = Probe.Indirect; + Probe.Used = true; + } + for (const auto &[Offset, Probe] : CallProbes) { + if (Probe.Used) + continue; + yaml::bolt::CallSiteInfo CSI; + CSI.Offset = Offset; + CSI.Probe = Probe.Id; + CSI.InlineTreeNode = Probe.Node; + CSI.Indirect = Probe.Indirect; + YamlBB.CallSites.emplace_back(CSI); } - return YamlProbes; } std::tuple<std::vector<yaml::bolt::InlineTreeNode>, @@ -343,12 +355,13 @@ YAMLProfileWriter::convert(const BinaryFunction &BF, bool UseDFS, const AddressProbesMap &ProbeMap = PseudoProbeDecoder->getAddress2ProbesMap(); const uint64_t FuncAddr = BF.getAddress(); - const std::pair<uint32_t, uint32_t> &BlockRange = - BB->getInputAddressRange(); - const std::pair<uint64_t, uint64_t> BlockAddrRange = { - FuncAddr + BlockRange.first, FuncAddr + 
BlockRange.second}; - auto Probes = ProbeMap.find(BlockAddrRange.first, BlockAddrRange.second); - YamlBB.PseudoProbes = writeBlockProbes(Probes, InlineTreeNodeId); + auto [Start, End] = BB->getInputAddressRange(); + Start += FuncAddr; + End += FuncAddr; + BlockProbeCtx Ctx; + for (const MCDecodedPseudoProbe &Probe : ProbeMap.find(Start, End)) + Ctx.addBlockProbe(InlineTreeNodeId, Probe, Probe.getAddress() - Start); + Ctx.finalize(YamlBB); } YamlBF.Blocks.emplace_back(YamlBB); diff --git a/bolt/test/X86/match-blocks-with-pseudo-probes-inline.test b/bolt/test/X86/match-blocks-with-pseudo-probes-inline.test index accb4742851ea..9224cf163dbcc 100644 --- a/bolt/test/X86/match-blocks-with-pseudo-probes-inline.test +++ b/bolt/test/X86/match-blocks-with-pseudo-probes-inline.test @@ -30,7 +30,7 @@ functions: insns: 11 hash: 0x1 exec: 1 - probes: [ { blx: 9 } ] + probes: [ { blk: [ 1, 4 ] } ] inline_tree: [ { } ] - name: foo fid: 10 @@ -43,7 +43,7 @@ functions: hash: 0x2 exec: 1 succ: [ { bid: 3, cnt: 0 } ] - probes: [ { blx: 3 } ] + probes: [ { blk: [ 1, 2 ] } ] inline_tree: [ { g: 1 }, { g: 0, cs: 8 } ] - name: main fid: 11 @@ -56,7 +56,7 @@ functions: hash: 0x3 exec: 1 succ: [ { bid: 3, cnt: 0 } ] - probes: [ { blx: 3, id: 1 }, { blx: 1 } ] + probes: [ { blk: [ 1, 2 ], ids: [ 1 ] }, { blk: [ 1 ] } ] inline_tree: [ { g: 2 }, { g: 1, cs: 2 }, { g: 0, p: 1, cs: 8 } ] pseudo_probe_desc: gs: [ 0xE413754A191DB537, 0x5CF8C24CDB18BDAC, 0xDB956436E78DD5FA ] diff --git a/bolt/test/X86/match-blocks-with-pseudo-probes.test b/bolt/test/X86/match-blocks-with-pseudo-probes.test index 40cb64ee82919..7be327d698b17 100644 --- a/bolt/test/X86/match-blocks-with-pseudo-probes.test +++ b/bolt/test/X86/match-blocks-with-pseudo-probes.test @@ -55,7 +55,7 @@ functions: hash: 0xFFFFFFFFFFFFFFF1 insns: 1 succ: [ { bid: 3, cnt: 1} ] - probes: [ { blx: 1 } ] + probes: [ { blk: [ 1 ] } ] inline_tree: [ { g: 0 } ] pseudo_probe_desc: gs: [ 0xDB956436E78DD5FA ] diff --git a/bolt/test/X86/pseudoprobe-decoding-inline.test b/bolt/test/X86/pseudoprobe-decoding-inline.test index e5e8aadc18f9e..9748fc1b6a4d4 100644 --- a/bolt/test/X86/pseudoprobe-decoding-inline.test +++ b/bolt/test/X86/pseudoprobe-decoding-inline.test @@ -14,17 +14,17 @@ # RUN: FileCheck --input-file %t.yaml2 %s --check-prefix CHECK-YAML # CHECK-YAML: name: bar # CHECK-YAML: - bid: 0 -# CHECK-YAML: probes: [ { blx: 9 } ] +# CHECK-YAML: probes: [ { blk: [ 1, 4 ] } ] # CHECK-YAML: inline_tree: [ { } ] # # CHECK-YAML: name: foo # CHECK-YAML: - bid: 0 -# CHECK-YAML: probes: [ { blx: 3 } ] +# CHECK-YAML: probes: [ { blk: [ 1, 2 ] } ] # CHECK-YAML: inline_tree: [ { g: 1 }, { g: 0, cs: 8 } ] # # CHECK-YAML: name: main # CHECK-YAML: - bid: 0 -# CHECK-YAML: probes: [ { blx: 3, id: 1 }, { blx: 1 } ] +# CHECK-YAML: probes: [ { blk: [ 1, 2 ], ids: [ 1 ] }, { } ] # CHECK-YAML: inline_tree: [ { g: 2 }, { g: 1, cs: 2 }, { g: 0, p: 1, cs: 8 } ] # # CHECK-YAML: pseudo_probe_desc: diff --git a/bolt/test/X86/pseudoprobe-decoding-noinline.test b/bolt/test/X86/pseudoprobe-decoding-noinline.test index 36a2fab74e857..4ba51cdc96f9e 100644 --- a/bolt/test/X86/pseudoprobe-decoding-noinline.test +++ b/bolt/test/X86/pseudoprobe-decoding-noinline.test @@ -15,17 +15,18 @@ # RUN: FileCheck --input-file %t.yaml2 %s --check-prefix CHECK-YAML # CHECK-YAML: name: bar # CHECK-YAML: - bid: 0 -# CHECK-YAML: probes: [ { blx: 9 } ] +# CHECK-YAML: probes: [ { blk: [ 1, 4 ] } ] # CHECK-YAML: inline_tree: [ { } ] # # CHECK-YAML: name: foo # CHECK-YAML: - bid: 0 -# CHECK-YAML: probes: [ { blx: 3 } ] +# 
CHECK-YAML: probes: [ { blk: [ 1, 2 ] } ] # CHECK-YAML: inline_tree: [ { g: 2 } ] # # CHECK-YAML: name: main # CHECK-YAML: - bid: 0 -# CHECK-YAML: probes: [ { blx: 1, call: [ 2 ] } ] +# CHECK-YAML: calls: [ { off: 0x4, fid: 0, cnt: 0, pp: 2 } ] +# CHECK-YAML: probes: [ { } ] # CHECK-YAML: inline_tree: [ { g: 1 } ] # # CHECK-YAML: pseudo_probe_desc: diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst index 10a8d095fede3..f7e6061044c6d 100644 --- a/clang/docs/OpenMPSupport.rst +++ b/clang/docs/OpenMPSupport.rst @@ -559,7 +559,7 @@ implementation. +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ | Clarifications to Fortran map semantics | :none:`unclaimed` | :none:`unclaimed` | | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| default clause at target construct | :part:`In Progress` | :none:`unclaimed` | | +| default clause at target construct | :good:`done` | :none:`unclaimed` | https://github.com/llvm/llvm-project/pull/162910 | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ | ref count update use_device_{ptr, addr} | :none:`unclaimed` | :none:`unclaimed` | | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 333aae74db4f8..98ced0b757314 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -588,6 +588,9 @@ Improvements to Clang's diagnostics - Diagnostics messages now refer to ``structured binding`` instead of ``decomposition``, to align with `P0615R0 `_ changing the term. (#GH157880) +- Clang now suppresses runtime behavior warnings for unreachable code in file-scope + variable initializers, matching the behavior for functions. This prevents false + positives for operations in unreachable branches of constant expressions. - Added a separate diagnostic group ``-Wfunction-effect-redeclarations``, for the more pedantic diagnostics for function effects (``[[clang::nonblocking]]`` and ``[[clang::nonallocating]]``). Moved the warning for a missing (though implied) attribute on a redeclaration into this group. @@ -971,6 +974,7 @@ OpenMP Support - Added support for 'omp fuse' directive. - Updated parsing and semantic analysis support for ``nowait`` clause to accept optional argument in OpenMP >= 60. +- Added support for ``default`` clause on ``target`` directive. Improvements ^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index 90e1f8d1eb5e9..52360b67b306c 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -54,7 +54,7 @@ CODEGENOPT(SeparateNamedSections, 1, 0, Benign) ///< Set for -fseparate-named-se CODEGENOPT(EnableAIXExtendedAltivecABI, 1, 0, Benign) ///< Set for -mabi=vec-extabi. Enables the extended Altivec ABI on AIX. CODEGENOPT(XCOFFReadOnlyPointers, 1, 0, Benign) ///< Set for -mxcoff-roptr. 
CODEGENOPT(AllTocData, 1, 0, Benign) ///< AIX -mtocdata -ENUM_CODEGENOPT(FramePointer, FramePointerKind, 2, FramePointerKind::None, Benign) /// frame-pointer: all,non-leaf,reserved,none +ENUM_CODEGENOPT(FramePointer, FramePointerKind, 3, FramePointerKind::None, Benign) /// frame-pointer: all,non-leaf,non-leaf-no-reserve,reserved,none ENUM_CODEGENOPT(ExceptionHandling, ExceptionHandlingKind, 3, ExceptionHandlingKind::None, NotCompatible) diff --git a/clang/include/clang/Basic/CodeGenOptions.h b/clang/include/clang/Basic/CodeGenOptions.h index 65d47c5e8b31d..b48adf046a37a 100644 --- a/clang/include/clang/Basic/CodeGenOptions.h +++ b/clang/include/clang/Basic/CodeGenOptions.h @@ -155,10 +155,13 @@ class CodeGenOptions : public CodeGenOptionsBase { std::string BinutilsVersion; enum class FramePointerKind { - None, // Omit all frame pointers. - Reserved, // Maintain valid frame pointer chain. - NonLeaf, // Keep non-leaf frame pointers. - All, // Keep all frame pointers. + NonLeafNoReserve, // Keep non-leaf frame pointers, allow the FP to be used + // as a GPR in leaf functions. + None, // Omit all frame pointers. + Reserved, // Maintain valid frame pointer chain. + NonLeaf, // Keep non-leaf frame pointers, don't allow the FP to be used as a + // GPR in leaf functions. + All, // Keep all frame pointers. }; static StringRef getFramePointerKindName(FramePointerKind Kind) { @@ -167,6 +170,8 @@ return "none"; case FramePointerKind::Reserved: return "reserved"; + case FramePointerKind::NonLeafNoReserve: + return "non-leaf-no-reserve"; case FramePointerKind::NonLeaf: return "non-leaf"; case FramePointerKind::All: diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td index a419fdcdd8493..a979578f0073b 100644 --- a/clang/include/clang/Options/Options.td +++ b/clang/include/clang/Options/Options.td @@ -5797,6 +5797,9 @@ def mno_warn_nonportable_cfstrings : Flag<["-"], "mno-warn-nonportable-cfstrings def mno_omit_leaf_frame_pointer : Flag<["-"], "mno-omit-leaf-frame-pointer">, Group<m_Group>; def momit_leaf_frame_pointer : Flag<["-"], "momit-leaf-frame-pointer">, Group<m_Group>, HelpText<"Omit frame pointer setup for leaf functions">; +def mno_reserve_frame_pointer_reg : Flag<["-"], "mno-reserve-frame-pointer-reg">, Group<m_Group>; +def mreserve_frame_pointer_reg : Flag<["-"], "mreserve-frame-pointer-reg">, Group<m_Group>, + HelpText<"Reserve the frame pointer register even if the function doesn't have a frame">; def moslib_EQ : Joined<["-"], "moslib=">, Group<m_Group>; def mpascal_strings : Flag<["-"], "mpascal-strings">, Alias<fpascal_strings>; def mred_zone : Flag<["-"], "mred-zone">, Group<m_Group>; @@ -8654,8 +8657,8 @@ def pic_is_pie : Flag<["-"], "pic-is-pie">, MarshallingInfoFlag<LangOpts<"PIE">>; def mframe_pointer_EQ : Joined<["-"], "mframe-pointer=">, - HelpText<"Specify which frame pointers to retain.">, Values<"all,non-leaf,reserved,none">, - NormalizedValuesScope<"CodeGenOptions::FramePointerKind">, NormalizedValues<["All", "NonLeaf", "Reserved", "None"]>, + HelpText<"Specify which frame pointers to retain.">, Values<"all,non-leaf,non-leaf-no-reserve,reserved,none">, + NormalizedValuesScope<"CodeGenOptions::FramePointerKind">, NormalizedValues<["All", "NonLeaf", "NonLeafNoReserve", "Reserved", "None"]>, MarshallingInfoEnum<CodeGenOpts<"FramePointer">, "None">; diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index dad8efd0f017f..58eb1c0a7c114 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -5223,11 +5223,7 @@ class Parser : public 
CodeCompletionHandler { /// assignment-expression /// '{' ... /// \endverbatim - ExprResult ParseInitializer() { - if (Tok.isNot(tok::l_brace)) - return ParseAssignmentExpression(); - return ParseBraceInitializer(); - } + ExprResult ParseInitializer(Decl *DeclForInitializer = nullptr); /// MayBeDesignationStart - Return true if the current token might be the /// start of a designator. If we can tell it is impossible that it is a diff --git a/clang/include/clang/Sema/AnalysisBasedWarnings.h b/clang/include/clang/Sema/AnalysisBasedWarnings.h index 4103c3f006a8f..20a2030f56034 100644 --- a/clang/include/clang/Sema/AnalysisBasedWarnings.h +++ b/clang/include/clang/Sema/AnalysisBasedWarnings.h @@ -14,15 +14,19 @@ #define LLVM_CLANG_SEMA_ANALYSISBASEDWARNINGS_H #include "clang/AST/Decl.h" +#include "clang/Sema/ScopeInfo.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/MapVector.h" #include <memory> namespace clang { +class AnalysisDeclContext; class Decl; class FunctionDecl; class QualType; class Sema; +class VarDecl; namespace sema { class FunctionScopeInfo; class SemaPPCallbacks; @@ -57,6 +61,8 @@ class AnalysisBasedWarnings { enum VisitFlag { NotVisited = 0, Visited = 1, Pending = 2 }; llvm::DenseMap<const FunctionDecl *, VisitFlag> VisitedFD; + std::multimap<VarDecl *, PossiblyUnreachableDiag> + VarDeclPossiblyUnreachableDiags; Policy PolicyOverrides; void clearOverrides(); @@ -107,6 +113,10 @@ class AnalysisBasedWarnings { // Issue warnings that require whole-translation-unit analysis. void IssueWarnings(TranslationUnitDecl *D); + void registerVarDeclWarning(VarDecl *VD, PossiblyUnreachableDiag PUD); + + void issueWarningsForRegisteredVarDecl(VarDecl *VD); + // Gets the default policy which is in effect at the given source location. Policy getPolicyInEffectAt(SourceLocation Loc); diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 0470645a9e7ad..163ab32fafa48 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -6756,6 +6756,11 @@ class Sema final : public SemaBase { /// suffice, e.g., in a default function argument. Decl *ManglingContextDecl; + /// Declaration for initializer if one is currently being + /// parsed. Used when an expression has a possibly unreachable + /// diagnostic to reference the declaration as a whole. + VarDecl *DeclForInitializer = nullptr; + /// If we are processing a decltype type, a set of call expressions /// for which we have deferred checking the completeness of the return type. SmallVector<CallExpr *, 8> DelayedDecltypeCalls; diff --git a/clang/lib/Analysis/AnalysisDeclContext.cpp b/clang/lib/Analysis/AnalysisDeclContext.cpp index 5a52056f3e6a5..f188fc6921ed1 100644 --- a/clang/lib/Analysis/AnalysisDeclContext.cpp +++ b/clang/lib/Analysis/AnalysisDeclContext.cpp @@ -117,6 +117,11 @@ Stmt *AnalysisDeclContext::getBody(bool &IsAutosynthesized) const { return BD->getBody(); else if (const auto *FunTmpl = dyn_cast_or_null<FunctionTemplateDecl>(D)) return FunTmpl->getTemplatedDecl()->getBody(); + else if (const auto *VD = dyn_cast_or_null<VarDecl>(D)) { + if (VD->isFileVarDecl()) { + return const_cast<Stmt *>(dyn_cast_or_null<Stmt>(VD->getInit())); + } + } llvm_unreachable("unknown code decl"); } diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 4cb47cb5aea5a..56a9a29f86c67 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -1992,6 +1992,7 @@ static void getTrivialDefaultFunctionAttributes( // This is the default behavior. 
break; case CodeGenOptions::FramePointerKind::Reserved: + case CodeGenOptions::FramePointerKind::NonLeafNoReserve: case CodeGenOptions::FramePointerKind::NonLeaf: case CodeGenOptions::FramePointerKind::All: FuncAttrs.addAttribute("frame-pointer", diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp index e392a12044a39..4bdba9b3da502 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.cpp +++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp @@ -261,12 +261,12 @@ static std::optional initializeLocalResourceArray( llvm::Type * CGHLSLRuntime::convertHLSLSpecificType(const Type *T, - SmallVector *Packoffsets) { + const CGHLSLOffsetInfo &OffsetInfo) { assert(T->isHLSLSpecificType() && "Not an HLSL specific type!"); // Check if the target has a specific translation for this type first. if (llvm::Type *TargetTy = - CGM.getTargetCodeGenInfo().getHLSLType(CGM, T, Packoffsets)) + CGM.getTargetCodeGenInfo().getHLSLType(CGM, T, OffsetInfo)) return TargetTy; llvm_unreachable("Generic handling of HLSL types is not supported."); @@ -357,25 +357,14 @@ createBufferHandleType(const HLSLBufferDecl *BufDecl) { return cast(QT.getTypePtr()); } -// Iterates over all declarations in the HLSL buffer and based on the -// packoffset or register(c#) annotations it fills outs the Layout -// vector with the user-specified layout offsets. -// The buffer offsets can be specified 2 ways: -// 1. declarations in cbuffer {} block can have a packoffset annotation -// (translates to HLSLPackOffsetAttr) -// 2. default constant buffer declarations at global scope can have -// register(c#) annotations (translates to HLSLResourceBindingAttr with -// RegisterType::C) -// It is not guaranteed that all declarations in a buffer have an annotation. -// For those where it is not specified a -1 value is added to the Layout -// vector. In the final layout these declarations will be placed at the end -// of the HLSL buffer after all of the elements with specified offset. -static void fillPackoffsetLayout(const HLSLBufferDecl *BufDecl, - SmallVector &Layout) { - assert(Layout.empty() && "expected empty vector for layout"); - assert(BufDecl->hasValidPackoffset()); +CGHLSLOffsetInfo CGHLSLOffsetInfo::fromDecl(const HLSLBufferDecl &BufDecl) { + CGHLSLOffsetInfo Result; - for (Decl *D : BufDecl->buffer_decls()) { + // If we don't have packoffset info, just return an empty result. + if (!BufDecl.hasValidPackoffset()) + return Result; + + for (Decl *D : BufDecl.buffer_decls()) { if (isa(D) || isa(D)) { continue; } @@ -384,11 +373,11 @@ static void fillPackoffsetLayout(const HLSLBufferDecl *BufDecl, continue; if (!VD->hasAttrs()) { - Layout.push_back(-1); + Result.Offsets.push_back(Unspecified); continue; } - int32_t Offset = -1; + uint32_t Offset = Unspecified; for (auto *Attr : VD->getAttrs()) { if (auto *POA = dyn_cast(Attr)) { Offset = POA->getOffsetInBytes(); @@ -401,8 +390,9 @@ static void fillPackoffsetLayout(const HLSLBufferDecl *BufDecl, break; } } - Layout.push_back(Offset); + Result.Offsets.push_back(Offset); } + return Result; } // Codegen for HLSLBufferDecl @@ -419,13 +409,9 @@ void CGHLSLRuntime::addBuffer(const HLSLBufferDecl *BufDecl) { return; // create global variable for the constant buffer - SmallVector Layout; - if (BufDecl->hasValidPackoffset()) - fillPackoffsetLayout(BufDecl, Layout); - - llvm::TargetExtType *TargetTy = - cast(convertHLSLSpecificType( - ResHandleTy, BufDecl->hasValidPackoffset() ? 
&Layout : nullptr)); + CGHLSLOffsetInfo OffsetInfo = CGHLSLOffsetInfo::fromDecl(*BufDecl); + llvm::TargetExtType *TargetTy = cast<llvm::TargetExtType>( + convertHLSLSpecificType(ResHandleTy, OffsetInfo)); llvm::GlobalVariable *BufGV = new GlobalVariable( TargetTy, /*isConstant*/ false, GlobalValue::LinkageTypes::ExternalLinkage, PoisonValue::get(TargetTy), diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h index 9d31714ab8606..488a322ca7569 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.h +++ b/clang/lib/CodeGen/CGHLSLRuntime.h @@ -81,6 +81,33 @@ class CodeGenModule; class CodeGenFunction; class LValue; +class CGHLSLOffsetInfo { + SmallVector<uint32_t> Offsets; + +public: + static const uint32_t Unspecified = ~0U; + + /// Iterates over all declarations in the HLSL buffer and based on the + /// packoffset or register(c#) annotations it fills out the Offsets vector + /// with the user-specified layout offsets. The buffer offsets can be + /// specified 2 ways: 1. declarations in cbuffer {} block can have a + /// packoffset annotation (translates to HLSLPackOffsetAttr) 2. default + /// constant buffer declarations at global scope can have register(c#) + /// annotations (translates to HLSLResourceBindingAttr with RegisterType::C) + /// It is not guaranteed that all declarations in a buffer have an annotation. + /// For those where it is not specified, a `~0U` value is added to the Offsets + /// vector. In the final layout these declarations will be placed at the end + /// of the HLSL buffer after all of the elements with specified offset. + static CGHLSLOffsetInfo fromDecl(const HLSLBufferDecl &BufDecl); + + /// Get the given offset, or `~0U` if there is no offset for the member. + uint32_t operator[](size_t I) const { + if (Offsets.empty()) + return Unspecified; + return Offsets[I]; + } +}; + class CGHLSLRuntime { public: //===----------------------------------------------------------------------===// @@ -167,9 +194,11 @@ class CGHLSLRuntime { CGHLSLRuntime(CodeGenModule &CGM) : CGM(CGM) {} virtual ~CGHLSLRuntime() {} - llvm::Type * - convertHLSLSpecificType(const Type *T, - SmallVector<int32_t> *Packoffsets = nullptr); + llvm::Type *convertHLSLSpecificType(const Type *T, + const CGHLSLOffsetInfo &OffsetInfo); + llvm::Type *convertHLSLSpecificType(const Type *T) { + return convertHLSLSpecificType(T, CGHLSLOffsetInfo()); + } void generateGlobalCtorDtorCalls(); diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 84cc47e3ee600..c585f1d41be15 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -1516,6 +1516,9 @@ void CodeGenModule::Release() { case CodeGenOptions::FramePointerKind::Reserved: getModule().setFramePointer(llvm::FramePointerKind::Reserved); break; + case CodeGenOptions::FramePointerKind::NonLeafNoReserve: + getModule().setFramePointer(llvm::FramePointerKind::NonLeafNoReserve); + break; case CodeGenOptions::FramePointerKind::NonLeaf: getModule().setFramePointer(llvm::FramePointerKind::NonLeaf); break; diff --git a/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp b/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp index 838903cdcd1ee..4bc6d565fd41f 100644 --- a/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp +++ b/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp @@ -66,8 +66,9 @@ namespace CodeGen { // annotation though. For those that don't, the PackOffsets array will contain // -1 value instead. These elements must be placed at the end of the layout // after all of the elements with specific offset. 
-llvm::TargetExtType *HLSLBufferLayoutBuilder::createLayoutType( - const RecordType *RT, const llvm::SmallVector<int32_t> *PackOffsets) { +llvm::TargetExtType * +HLSLBufferLayoutBuilder::createLayoutType(const RecordType *RT, + const CGHLSLOffsetInfo &OffsetInfo) { // check if we already have the layout type for this struct if (llvm::TargetExtType *Ty = @@ -101,14 +102,10 @@ llvm::TargetExtType *HLSLBufferLayoutBuilder::createLayoutType( const CXXRecordDecl *RD = RecordDecls.pop_back_val(); for (const auto *FD : RD->fields()) { - assert((!PackOffsets || Index < PackOffsets->size()) && - "number of elements in layout struct does not match number of " - "packoffset annotations"); - // No PackOffset info at all, or have a valid packoffset/register(c#) // annotations value -> layout the field. - const int PO = PackOffsets ? (*PackOffsets)[Index++] : -1; - if (!PackOffsets || PO != -1) { + const uint32_t PO = OffsetInfo[Index++]; + if (PO != CGHLSLOffsetInfo::Unspecified) { if (!layoutField(FD, EndOffset, FieldOffset, FieldType, PO)) return nullptr; Layout.push_back(FieldOffset); @@ -175,7 +172,7 @@ bool HLSLBufferLayoutBuilder::layoutField(const FieldDecl *FD, unsigned &EndOffset, unsigned &FieldOffset, llvm::Type *&FieldType, - int Packoffset) { + uint32_t Packoffset) { // Size of element; for arrays this is a size of a single element in the // array. Total array size is calculated as (ArrayCount-1) * ArrayStride + @@ -201,8 +198,9 @@ bool HLSLBufferLayoutBuilder::layoutField(const FieldDecl *FD, // For array of structures, create a new array with a layout type // instead of the structure type. if (Ty->isStructureOrClassType()) { + CGHLSLOffsetInfo EmptyOffsets; llvm::Type *NewTy = cast<llvm::TargetExtType>( - createLayoutType(Ty->getAsCanonical<RecordType>())); + createLayoutType(Ty->getAsCanonical<RecordType>(), EmptyOffsets)); if (!NewTy) return false; assert(isa<llvm::TargetExtType>(NewTy) && "expected target type"); @@ -216,17 +214,20 @@ bool HLSLBufferLayoutBuilder::layoutField(const FieldDecl *FD, ElemLayoutTy = CGM.getTypes().ConvertTypeForMem(FieldTy); } ArrayStride = llvm::alignTo(ElemSize, CBufferRowSizeInBytes); - ElemOffset = (Packoffset != -1) ? Packoffset : NextRowOffset; + ElemOffset = (Packoffset != CGHLSLOffsetInfo::Unspecified) ? Packoffset + : NextRowOffset; } else if (FieldTy->isStructureOrClassType()) { // Create a layout type for the structure + CGHLSLOffsetInfo EmptyOffsets; ElemLayoutTy = createLayoutType( - cast<RecordType>(FieldTy->getAsCanonical())); + cast<RecordType>(FieldTy->getAsCanonical()), EmptyOffsets); if (!ElemLayoutTy) return false; assert(isa<llvm::TargetExtType>(ElemLayoutTy) && "expected target type"); ElemSize = cast<llvm::TargetExtType>(ElemLayoutTy)->getIntParameter(0); - ElemOffset = (Packoffset != -1) ? Packoffset : NextRowOffset; + ElemOffset = (Packoffset != CGHLSLOffsetInfo::Unspecified) ? 
Packoffset + : NextRowOffset; } else { // scalar or vector - find element size and alignment @@ -246,7 +247,7 @@ bool HLSLBufferLayoutBuilder::layoutField(const FieldDecl *FD, } // calculate or get element offset for the vector or scalar - if (Packoffset != -1) { + if (Packoffset != CGHLSLOffsetInfo::Unspecified) { ElemOffset = Packoffset; } else { ElemOffset = llvm::alignTo(EndOffset, Align); @@ -269,5 +270,13 @@ bool HLSLBufferLayoutBuilder::layoutField(const FieldDecl *FD, return true; } +bool HLSLBufferLayoutBuilder::layoutField(const FieldDecl *FD, + unsigned &EndOffset, + unsigned &FieldOffset, + llvm::Type *&FieldType) { + return layoutField(FD, EndOffset, FieldOffset, FieldType, + CGHLSLOffsetInfo::Unspecified); +} + } // namespace CodeGen } // namespace clang diff --git a/clang/lib/CodeGen/HLSLBufferLayoutBuilder.h b/clang/lib/CodeGen/HLSLBufferLayoutBuilder.h index 61240b280cfcb..916e60e83e2c0 100644 --- a/clang/lib/CodeGen/HLSLBufferLayoutBuilder.h +++ b/clang/lib/CodeGen/HLSLBufferLayoutBuilder.h @@ -14,6 +14,7 @@ class RecordType; class FieldDecl; namespace CodeGen { +class CGHLSLOffsetInfo; class CodeGenModule; //===----------------------------------------------------------------------===// @@ -33,14 +34,15 @@ class HLSLBufferLayoutBuilder { // Returns LLVM target extension type with the name LayoutTypeName // for given structure type and layout data. The first number in // the Layout is the size followed by offsets for each struct element. - llvm::TargetExtType * - createLayoutType(const RecordType *StructType, - const llvm::SmallVector *Packoffsets = nullptr); + llvm::TargetExtType *createLayoutType(const RecordType *StructType, + const CGHLSLOffsetInfo &OffsetInfo); private: bool layoutField(const clang::FieldDecl *FD, unsigned &EndOffset, unsigned &FieldOffset, llvm::Type *&FieldType, - int Packoffset = -1); + uint32_t Packoffset); + bool layoutField(const clang::FieldDecl *FD, unsigned &EndOffset, + unsigned &FieldOffset, llvm::Type *&FieldType); }; } // namespace CodeGen diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h index f63e900669d97..383f52f298d2e 100644 --- a/clang/lib/CodeGen/TargetInfo.h +++ b/clang/lib/CodeGen/TargetInfo.h @@ -39,6 +39,7 @@ class ABIInfo; class CallArgList; class CodeGenFunction; class CGBlockInfo; +class CGHLSLOffsetInfo; class SwiftABIInfo; /// TargetCodeGenInfo - This class organizes various target-specific @@ -442,9 +443,8 @@ class TargetCodeGenInfo { } /// Return an LLVM type that corresponds to a HLSL type - virtual llvm::Type * - getHLSLType(CodeGenModule &CGM, const Type *T, - const SmallVector *Packoffsets = nullptr) const { + virtual llvm::Type *getHLSLType(CodeGenModule &CGM, const Type *T, + const CGHLSLOffsetInfo &OffsetInfo) const { return nullptr; } diff --git a/clang/lib/CodeGen/Targets/DirectX.cpp b/clang/lib/CodeGen/Targets/DirectX.cpp index b4cebb9a32aca..f30b30284cb12 100644 --- a/clang/lib/CodeGen/Targets/DirectX.cpp +++ b/clang/lib/CodeGen/Targets/DirectX.cpp @@ -29,14 +29,13 @@ class DirectXTargetCodeGenInfo : public TargetCodeGenInfo { DirectXTargetCodeGenInfo(CodeGen::CodeGenTypes &CGT) : TargetCodeGenInfo(std::make_unique(CGT)) {} - llvm::Type * - getHLSLType(CodeGenModule &CGM, const Type *T, - const SmallVector *Packoffsets = nullptr) const override; + llvm::Type *getHLSLType(CodeGenModule &CGM, const Type *T, + const CGHLSLOffsetInfo &OffsetInfo) const override; }; llvm::Type *DirectXTargetCodeGenInfo::getHLSLType( CodeGenModule &CGM, const Type *Ty, - const SmallVector *Packoffsets) 
const { + const CGHLSLOffsetInfo &OffsetInfo) const { auto *ResType = dyn_cast<HLSLAttributedResourceType>(Ty); if (!ResType) return nullptr; @@ -78,7 +77,7 @@ llvm::Type *DirectXTargetCodeGenInfo::getHLSLType( llvm::Type *BufferLayoutTy = HLSLBufferLayoutBuilder(CGM, "dx.Layout") .createLayoutType(ContainedTy->castAsCanonical<RecordType>(), - Packoffsets); + OffsetInfo); if (!BufferLayoutTy) return nullptr; diff --git a/clang/lib/CodeGen/Targets/SPIR.cpp b/clang/lib/CodeGen/Targets/SPIR.cpp index abd049aca0ed7..be7e9ccecae9f 100644 --- a/clang/lib/CodeGen/Targets/SPIR.cpp +++ b/clang/lib/CodeGen/Targets/SPIR.cpp @@ -53,9 +53,8 @@ class CommonSPIRTargetCodeGenInfo : public TargetCodeGenInfo { unsigned getDeviceKernelCallingConv() const override; llvm::Type *getOpenCLType(CodeGenModule &CGM, const Type *T) const override; - llvm::Type * - getHLSLType(CodeGenModule &CGM, const Type *Ty, - const SmallVector<int32_t> *Packoffsets = nullptr) const override; + llvm::Type *getHLSLType(CodeGenModule &CGM, const Type *Ty, + const CGHLSLOffsetInfo &OffsetInfo) const override; llvm::Type *getSPIRVImageTypeFromHLSLResource( const HLSLAttributedResourceType::Attributes &attributes, QualType SampledType, CodeGenModule &CGM) const; @@ -260,8 +259,16 @@ CommonSPIRTargetCodeGenInfo::getNullPointer(const CodeGen::CodeGenModule &CGM, LangAS AS = QT->getUnqualifiedDesugaredType()->isNullPtrType() ? LangAS::Default : QT->getPointeeType().getAddressSpace(); + unsigned ASAsInt = static_cast<unsigned>(AS); + unsigned FirstTargetASAsInt = + static_cast<unsigned>(LangAS::FirstTargetAddressSpace); + unsigned CodeSectionINTELAS = FirstTargetASAsInt + 9; + // As per SPV_INTEL_function_pointers, it is illegal to addrspacecast + // function pointers to/from the generic AS. + bool IsFunctionPtrAS = + CGM.getTriple().isSPIRV() && ASAsInt == CodeSectionINTELAS; if (AS == LangAS::Default || AS == LangAS::opencl_generic || - AS == LangAS::opencl_constant) + AS == LangAS::opencl_constant || IsFunctionPtrAS) return llvm::ConstantPointerNull::get(PT); auto &Ctx = CGM.getContext(); @@ -510,7 +517,7 @@ static llvm::Type *getInlineSpirvType(CodeGenModule &CGM, llvm::Type *CommonSPIRTargetCodeGenInfo::getHLSLType( CodeGenModule &CGM, const Type *Ty, - const SmallVector<int32_t> *Packoffsets) const { + const CGHLSLOffsetInfo &OffsetInfo) const { llvm::LLVMContext &Ctx = CGM.getLLVMContext(); if (auto *SpirvType = dyn_cast<HLSLInlineSpirvType>(Ty)) @@ -559,7 +566,7 @@ llvm::Type *CommonSPIRTargetCodeGenInfo::getHLSLType( llvm::Type *BufferLayoutTy = HLSLBufferLayoutBuilder(CGM, "spirv.Layout") .createLayoutType(ContainedTy->castAsCanonical<RecordType>(), - Packoffsets); + OffsetInfo); uint32_t StorageClass = /* Uniform storage class */ 2; return llvm::TargetExtType::get(Ctx, "spirv.VulkanBuffer", {BufferLayoutTy}, {StorageClass, false}); diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index f1389ed42cd7a..b1e323ffc1a28 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5787,6 +5787,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, case CodeGenOptions::FramePointerKind::Reserved: FPKeepKindStr = "-mframe-pointer=reserved"; break; + case CodeGenOptions::FramePointerKind::NonLeafNoReserve: + FPKeepKindStr = "-mframe-pointer=non-leaf-no-reserve"; + break; case CodeGenOptions::FramePointerKind::NonLeaf: FPKeepKindStr = "-mframe-pointer=non-leaf"; break; diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 2b279387d2c14..d58735294aa62 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp 
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -240,26 +240,39 @@ static bool framePointerImpliesLeafFramePointer(const llvm::opt::ArgList &Args, clang::CodeGenOptions::FramePointerKind getFramePointerKind(const llvm::opt::ArgList &Args, const llvm::Triple &Triple) { - // There are three things to consider here: + // There are four things to consider here: // * Should a frame record be created for non-leaf functions? // * Should a frame record be created for leaf functions? - // * Is the frame pointer register reserved, i.e. must it always point to - // either a new, valid frame record or be un-modified? + // * Is the frame pointer register reserved in non-leaf functions? + // i.e. must it always point to either a new, valid frame record or be + // un-modified? + // * Is the frame pointer register reserved in leaf functions? // // Not all combinations of these are valid: // * It's not useful to have leaf frame records without non-leaf ones. // * It's not useful to have frame records without reserving the frame // pointer. // - // | Non-leaf | Leaf | Reserved | - // | N | N | N | FramePointerKind::None - // | N | N | Y | FramePointerKind::Reserved - // | N | Y | N | Invalid - // | N | Y | Y | Invalid - // | Y | N | N | Invalid - // | Y | N | Y | FramePointerKind::NonLeaf - // | Y | Y | N | Invalid - // | Y | Y | Y | FramePointerKind::All + // | Frame Setup | Reg Reserved | + // |-----------------|-----------------| + // | Non-leaf | Leaf | Non-Leaf | Leaf | + // |----------|------|----------|------| + // | N | N | N | N | FramePointerKind::None + // | N | N | N | Y | Invalid + // | N | N | Y | N | Invalid + // | N | N | Y | Y | FramePointerKind::Reserved + // | N | Y | N | N | Invalid + // | N | Y | N | Y | Invalid + // | N | Y | Y | N | Invalid + // | N | Y | Y | Y | Invalid + // | Y | N | N | N | Invalid + // | Y | N | N | Y | Invalid + // | Y | N | Y | N | FramePointerKind::NonLeafNoReserve + // | Y | N | Y | Y | FramePointerKind::NonLeaf + // | Y | Y | N | N | Invalid + // | Y | Y | N | Y | Invalid + // | Y | Y | Y | N | Invalid + // | Y | Y | Y | Y | FramePointerKind::All // // The FramePointerKind::Reserved case is currently only reachable for Arm, // which has the -mframe-chain= option which can (in combination with @@ -278,12 +291,18 @@ getFramePointerKind(const llvm::opt::ArgList &Args, Args.hasFlag(options::OPT_mno_omit_leaf_frame_pointer, options::OPT_momit_leaf_frame_pointer, DefaultLeafFP); - bool FPRegReserved = EnableFP || mustMaintainValidFrameChain(Args, Triple); + bool FPRegReserved = Args.hasFlag(options::OPT_mreserve_frame_pointer_reg, + options::OPT_mno_reserve_frame_pointer_reg, + mustMaintainValidFrameChain(Args, Triple)); if (EnableFP) { if (EnableLeafFP) return clang::CodeGenOptions::FramePointerKind::All; - return clang::CodeGenOptions::FramePointerKind::NonLeaf; + + if (FPRegReserved) + return clang::CodeGenOptions::FramePointerKind::NonLeaf; + + return clang::CodeGenOptions::FramePointerKind::NonLeafNoReserve; } if (FPRegReserved) return clang::CodeGenOptions::FramePointerKind::Reserved; diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index 8d53547f8ab66..7c20b241d0dd8 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -1088,6 +1088,9 @@ void Flang::ConstructJob(Compilation &C, const JobAction &JA, case CodeGenOptions::FramePointerKind::Reserved: FPKeepKindStr = "-mframe-pointer=reserved"; break; + case CodeGenOptions::FramePointerKind::NonLeafNoReserve: + FPKeepKindStr 
= "-mframe-pointer=non-leaf-no-reserve"; + break; case CodeGenOptions::FramePointerKind::NonLeaf: FPKeepKindStr = "-mframe-pointer=non-leaf"; break; diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index 5fcb659768655..8688ccf41acb5 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -2613,7 +2613,7 @@ Decl *Parser::ParseDeclarationAfterDeclaratorAndAttributes( } PreferredType.enterVariableInit(Tok.getLocation(), ThisDecl); - ExprResult Init = ParseInitializer(); + ExprResult Init = ParseInitializer(ThisDecl); // If this is the only decl in (possibly) range based for statement, // our best guess is that the user meant ':' instead of '='. diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp index b96968d4592f5..d8ed7e3ff96bd 100644 --- a/clang/lib/Parse/ParseDeclCXX.cpp +++ b/clang/lib/Parse/ParseDeclCXX.cpp @@ -3359,7 +3359,7 @@ ExprResult Parser::ParseCXXMemberInitializer(Decl *D, bool IsFunction, Diag(Tok, diag::err_ms_property_initializer) << PD; return ExprError(); } - return ParseInitializer(); + return ParseInitializer(D); } void Parser::SkipCXXMemberSpecification(SourceLocation RecordLoc, diff --git a/clang/lib/Parse/ParseInit.cpp b/clang/lib/Parse/ParseInit.cpp index a3be3744a9327..0e86c4c48d5e4 100644 --- a/clang/lib/Parse/ParseInit.cpp +++ b/clang/lib/Parse/ParseInit.cpp @@ -581,3 +581,26 @@ bool Parser::ParseMicrosoftIfExistsBraceInitializer(ExprVector &InitExprs, return !trailingComma; } + +ExprResult Parser::ParseInitializer(Decl *DeclForInitializer) { + // Set DeclForInitializer for file-scope variables. + // For constexpr references, set it to suppress runtime warnings. + // For non-constexpr references, don't set it to avoid evaluation issues + // with self-referencing initializers. Local variables (including local + // constexpr) should emit runtime warnings. 
+ if (DeclForInitializer && !Actions.ExprEvalContexts.empty()) { + if (auto *VD = dyn_cast<VarDecl>(DeclForInitializer); + VD && VD->isFileVarDecl() && + (!VD->getType()->isReferenceType() || VD->isConstexpr())) + Actions.ExprEvalContexts.back().DeclForInitializer = VD; + } + + ExprResult init; + if (Tok.isNot(tok::l_brace)) { + init = ParseAssignmentExpression(); + } else { + init = ParseBraceInitializer(); + } + + return init; +} diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp index 334438edfc2e8..32a406e2c065f 100644 --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -339,7 +339,7 @@ void Parser::ParseOpenMPReductionInitializerForDecl(VarDecl *OmpPrivParm) { } PreferredType.enterVariableInit(Tok.getLocation(), OmpPrivParm); - ExprResult Init = ParseInitializer(); + ExprResult Init = ParseInitializer(OmpPrivParm); if (Init.isInvalid()) { SkipUntil(tok::r_paren, tok::annot_pragma_openmp_end, StopBeforeMatch); diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp index 9ae9a001cf3be..d58eac19033ed 100644 --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp @@ -2723,6 +2723,70 @@ static void flushDiagnostics(Sema &S, const sema::FunctionScopeInfo *fscope) { S.Diag(D.Loc, D.PD); } +template <typename IteratorTy> +static void emitPossiblyUnreachableDiags(Sema &S, AnalysisDeclContext &AC, + std::pair<IteratorTy, IteratorTy> PUDs) { + + if (PUDs.first == PUDs.second) + return; + + for (auto I = PUDs.first; I != PUDs.second; ++I) { + for (const Stmt *S : I->Stmts) + AC.registerForcedBlockExpression(S); + } + + if (AC.getCFG()) { + CFGReverseBlockReachabilityAnalysis *Analysis = + AC.getCFGReachablityAnalysis(); + + for (auto I = PUDs.first; I != PUDs.second; ++I) { + const auto &D = *I; + if (llvm::all_of(D.Stmts, [&](const Stmt *St) { + const CFGBlock *Block = AC.getBlockForRegisteredExpression(St); + // FIXME: We should be able to assert that block is non-null, but + // the CFG analysis can skip potentially-evaluated expressions in + // edge cases; see test/Sema/vla-2.c. 
+ if (Block && Analysis) + if (!Analysis->isReachable(&AC.getCFG()->getEntry(), Block)) + return false; + return true; + })) { + S.Diag(D.Loc, D.PD); + } + } + } else { + for (auto I = PUDs.first; I != PUDs.second; ++I) + S.Diag(I->Loc, I->PD); + } +} + +void sema::AnalysisBasedWarnings::registerVarDeclWarning( + VarDecl *VD, clang::sema::PossiblyUnreachableDiag PUD) { + VarDeclPossiblyUnreachableDiags.emplace(VD, PUD); +} + +void sema::AnalysisBasedWarnings::issueWarningsForRegisteredVarDecl( + VarDecl *VD) { + if (!llvm::is_contained(VarDeclPossiblyUnreachableDiags, VD)) + return; + + AnalysisDeclContext AC(/*Mgr=*/nullptr, VD); + + AC.getCFGBuildOptions().PruneTriviallyFalseEdges = true; + AC.getCFGBuildOptions().AddEHEdges = false; + AC.getCFGBuildOptions().AddInitializers = true; + AC.getCFGBuildOptions().AddImplicitDtors = true; + AC.getCFGBuildOptions().AddTemporaryDtors = true; + AC.getCFGBuildOptions().AddCXXNewAllocator = false; + AC.getCFGBuildOptions().AddCXXDefaultInitExprInCtors = true; + + auto Range = VarDeclPossiblyUnreachableDiags.equal_range(VD); + auto SecondRange = + llvm::make_second_range(llvm::make_range(Range.first, Range.second)); + emitPossiblyUnreachableDiags( + S, AC, std::make_pair(SecondRange.begin(), SecondRange.end())); +} + // An AST Visitor that calls a callback function on each callable DEFINITION // that is NOT in a dependent context: class CallableVisitor : public DynamicRecursiveASTVisitor { @@ -2934,45 +2998,8 @@ void clang::sema::AnalysisBasedWarnings::IssueWarnings( } // Emit delayed diagnostics. - if (!fscope->PossiblyUnreachableDiags.empty()) { - bool analyzed = false; - - // Register the expressions with the CFGBuilder. - for (const auto &D : fscope->PossiblyUnreachableDiags) { - for (const Stmt *S : D.Stmts) - AC.registerForcedBlockExpression(S); - } - - if (AC.getCFG()) { - analyzed = true; - for (const auto &D : fscope->PossiblyUnreachableDiags) { - bool AllReachable = true; - for (const Stmt *S : D.Stmts) { - const CFGBlock *block = AC.getBlockForRegisteredExpression(S); - CFGReverseBlockReachabilityAnalysis *cra = - AC.getCFGReachablityAnalysis(); - // FIXME: We should be able to assert that block is non-null, but - // the CFG analysis can skip potentially-evaluated expressions in - // edge cases; see test/Sema/vla-2.c. - if (block && cra) { - // Can this block be reached from the entrance? - if (!cra->isReachable(&AC.getCFG()->getEntry(), block)) { - AllReachable = false; - break; - } - } - // If we cannot map to a basic block, assume the statement is - // reachable. - } - - if (AllReachable) - S.Diag(D.Loc, D.PD); - } - } - - if (!analyzed) - flushDiagnostics(S, fscope); - } + auto &PUDs = fscope->PossiblyUnreachableDiags; + emitPossiblyUnreachableDiags(S, AC, std::make_pair(PUDs.begin(), PUDs.end())); // Warning: check missing 'return' if (P.enableCheckFallThrough) { diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 60e060766dc8f..0ab408cd060c8 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -59,6 +59,7 @@ #include "clang/Sema/SemaWasm.h" #include "clang/Sema/Template.h" #include "llvm/ADT/STLForwardCompat.h" +#include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" @@ -13117,6 +13118,13 @@ namespace { if (isa(OrigDecl)) return; + // Skip checking for file-scope constexpr variables - constant evaluation + // will produce appropriate errors without needing runtime diagnostics. 
+ // Local constexpr should still emit runtime warnings. + if (auto *VD = dyn_cast<VarDecl>(OrigDecl); + VD && VD->isConstexpr() && VD->isFileVarDecl()) + return; + E = E->IgnoreParens(); // Skip checking T a = a where T is not a record or reference type. @@ -13744,6 +13752,11 @@ void Sema::DiagnoseUniqueObjectDuplication(const VarDecl *VD) { } void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) { + auto ResetDeclForInitializer = llvm::make_scope_exit([this]() { + if (!this->ExprEvalContexts.empty()) + this->ExprEvalContexts.back().DeclForInitializer = nullptr; + }); + // If there is no declaration, there was an error parsing it. Just ignore // the initializer. if (!RealDecl) { @@ -15096,6 +15109,10 @@ void Sema::FinalizeDeclaration(Decl *ThisDecl) { if (!VD) return; + // Emit any deferred warnings for the variable's initializer, even if the + // variable is invalid. + AnalysisWarnings.issueWarningsForRegisteredVarDecl(VD); + // Apply an implicit SectionAttr if '#pragma clang section bss|data|rodata' is active if (VD->hasGlobalStorage() && VD->isThisDeclarationADefinition() && !inTemplateInstantiation() && !VD->hasAttr<SectionAttr>()) { diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index d99994ac3b8b8..1a2fe0b3c17a7 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -20635,31 +20635,36 @@ void Sema::MarkDeclarationsReferencedInExpr(Expr *E, } /// Emit a diagnostic when statements are reachable. -/// FIXME: check for reachability even in expressions for which we don't build a -/// CFG (eg, in the initializer of a global or in a constant expression). -/// For example, -/// namespace { auto *p = new double[3][false ? (1, 2) : 3]; } bool Sema::DiagIfReachable(SourceLocation Loc, ArrayRef<const Stmt *> Stmts, const PartialDiagnostic &PD) { - if (!Stmts.empty() && getCurFunctionOrMethodDecl()) { - if (!FunctionScopes.empty()) - FunctionScopes.back()->PossiblyUnreachableDiags.push_back( - sema::PossiblyUnreachableDiag(PD, Loc, Stmts)); - return true; - } - + VarDecl *Decl = ExprEvalContexts.back().DeclForInitializer; // The initializer of a constexpr variable or of the first declaration of a // static data member is not syntactically a constant evaluated constant, // but nonetheless is always required to be a constant expression, so we // can skip diagnosing. - // FIXME: Using the mangling context here is a hack. - if (auto *VD = dyn_cast_or_null<VarDecl>( - ExprEvalContexts.back().ManglingContextDecl)) { - if (VD->isConstexpr() || - (VD->isStaticDataMember() && VD->isFirstDecl() && !VD->isInline())) - return false; - // FIXME: For any other kind of variable, we should build a CFG for its - // initializer and check whether the context in question is reachable. + if (Decl && + (Decl->isConstexpr() || (Decl->isStaticDataMember() && + Decl->isFirstDecl() && !Decl->isInline()))) + return false; + + if (Stmts.empty()) { + Diag(Loc, PD); + return true; + } + + if (getCurFunction()) { + FunctionScopes.back()->PossiblyUnreachableDiags.push_back( + sema::PossiblyUnreachableDiag(PD, Loc, Stmts)); + return true; + } + + // For non-constexpr file-scope variables with reachability context (non-empty + // Stmts), build a CFG for the initializer and check whether the context in + // question is reachable. 
+  if (Decl && Decl->isFileVarDecl()) {
+    AnalysisWarnings.registerVarDeclWarning(
+        Decl, sema::PossiblyUnreachableDiag(PD, Loc, Stmts));
+    return true;
   }
 
   Diag(Loc, PD);
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index eb2d4a566df91..17362fa5fab4f 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -17762,45 +17762,101 @@ OMPClause *SemaOpenMP::ActOnOpenMPDefaultClause(
         << getOpenMPClauseNameForDiag(OMPC_default);
     return nullptr;
   }
-
-  switch (M) {
-  case OMP_DEFAULT_none:
-    DSAStack->setDefaultDSANone(MLoc);
-    break;
-  case OMP_DEFAULT_shared:
-    DSAStack->setDefaultDSAShared(MLoc);
-    break;
-  case OMP_DEFAULT_firstprivate:
-    DSAStack->setDefaultDSAFirstPrivate(MLoc);
-    break;
-  case OMP_DEFAULT_private:
-    DSAStack->setDefaultDSAPrivate(MLoc);
-    break;
-  default:
-    llvm_unreachable("DSA unexpected in OpenMP default clause");
-  }
-
-  switch (VCKind) {
-  case OMPC_DEFAULT_VC_aggregate:
-    DSAStack->setDefaultDSAVCAggregate(VCKindLoc);
-    break;
-  case OMPC_DEFAULT_VC_all:
-    DSAStack->setDefaultDSAVCAll(VCKindLoc);
-    break;
-  case OMPC_DEFAULT_VC_allocatable:
-    DSAStack->setDefaultDSAVCAllocatable(VCKindLoc);
-    break;
-  case OMPC_DEFAULT_VC_pointer:
-    DSAStack->setDefaultDSAVCPointer(VCKindLoc);
-    break;
-  case OMPC_DEFAULT_VC_scalar:
-    DSAStack->setDefaultDSAVCScalar(VCKindLoc);
-    break;
-  default:
+  if (VCKind == OMPC_DEFAULT_VC_unknown) {
     Diag(VCKindLoc, diag::err_omp_default_vc)
         << getOpenMPSimpleClauseTypeName(OMPC_default, unsigned(M));
+    return nullptr;
   }
+
+  bool IsTargetDefault =
+      getLangOpts().OpenMP >= 60 &&
+      isOpenMPTargetExecutionDirective(DSAStack->getCurrentDirective());
+
+  // OpenMP 6.0, page 224, lines 3-4, default clause, Semantics:
+  // If data-sharing-attribute is shared then the clause has no effect
+  // on a target construct;
+  if (IsTargetDefault && M == OMP_DEFAULT_shared)
+    return nullptr;
+
+  auto SetDefaultClauseAttrs = [&](llvm::omp::DefaultKind M,
+                                   OpenMPDefaultClauseVariableCategory VCKind) {
+    OpenMPDefaultmapClauseModifier DefMapMod;
+    OpenMPDefaultmapClauseKind DefMapKind;
+    // default data-sharing-attribute
+    switch (M) {
+    case OMP_DEFAULT_none:
+      if (IsTargetDefault)
+        DefMapMod = OMPC_DEFAULTMAP_MODIFIER_none;
+      else
+        DSAStack->setDefaultDSANone(MLoc);
+      break;
+    case OMP_DEFAULT_firstprivate:
+      if (IsTargetDefault)
+        DefMapMod = OMPC_DEFAULTMAP_MODIFIER_firstprivate;
+      else
+        DSAStack->setDefaultDSAFirstPrivate(MLoc);
+      break;
+    case OMP_DEFAULT_private:
+      if (IsTargetDefault)
+        DefMapMod = OMPC_DEFAULTMAP_MODIFIER_private;
+      else
+        DSAStack->setDefaultDSAPrivate(MLoc);
+      break;
+    case OMP_DEFAULT_shared:
+      assert(!IsTargetDefault && "DSA shared invalid with target directive");
+      DSAStack->setDefaultDSAShared(MLoc);
+      break;
+    default:
+      llvm_unreachable("unexpected DSA in OpenMP default clause");
+    }
+    // default variable-category
+    switch (VCKind) {
+    case OMPC_DEFAULT_VC_aggregate:
+      if (IsTargetDefault)
+        DefMapKind = OMPC_DEFAULTMAP_aggregate;
+      else
+        DSAStack->setDefaultDSAVCAggregate(VCKindLoc);
+      break;
+    case OMPC_DEFAULT_VC_pointer:
+      if (IsTargetDefault)
+        DefMapKind = OMPC_DEFAULTMAP_pointer;
+      else
+        DSAStack->setDefaultDSAVCPointer(VCKindLoc);
+      break;
+    case OMPC_DEFAULT_VC_scalar:
+      if (IsTargetDefault)
+        DefMapKind = OMPC_DEFAULTMAP_scalar;
+      else
+        DSAStack->setDefaultDSAVCScalar(VCKindLoc);
+      break;
+    case OMPC_DEFAULT_VC_all:
+      if (IsTargetDefault)
+        DefMapKind = OMPC_DEFAULTMAP_all;
+      else
+        DSAStack->setDefaultDSAVCAll(VCKindLoc);
+      break;
+    default:
+      llvm_unreachable("unexpected variable category in OpenMP default clause");
+    }
+    // OpenMP 6.0, page 224, lines 4-5, default clause, Semantics:
+    // otherwise, its effect on a target construct is equivalent to
+    // specifying the defaultmap clause with the same data-sharing-attribute
+    // and variable-category.
+    //
+    // If earlier than OpenMP 6.0, or not a target directive, the default DSA
+    // has already been set above, as before.
+    if (IsTargetDefault) {
+      if (DefMapKind == OMPC_DEFAULTMAP_all) {
+        DSAStack->setDefaultDMAAttr(DefMapMod, OMPC_DEFAULTMAP_aggregate, MLoc);
+        DSAStack->setDefaultDMAAttr(DefMapMod, OMPC_DEFAULTMAP_scalar, MLoc);
+        DSAStack->setDefaultDMAAttr(DefMapMod, OMPC_DEFAULTMAP_pointer, MLoc);
+      } else {
+        DSAStack->setDefaultDMAAttr(DefMapMod, DefMapKind, MLoc);
+      }
+    }
+  };
+
+  SetDefaultClauseAttrs(M, VCKind);
 
   return new (getASTContext()) OMPDefaultClause(M, MLoc, VCKind, VCKindLoc,
                                                 StartLoc, LParenLoc, EndLoc);
 }
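Read together, the two quoted 6.0 sentences mean a default clause on a target construct is lowered through the defaultmap machinery rather than through the DSA stack. A minimal sketch of the equivalence (hypothetical source, mirroring the comments in the new test further below; 'equiv' is an illustrative name, not from the patch):

    // Under -fopenmp-version=60, these two target regions receive the same
    // implicit data-mapping treatment: on a target construct,
    // default(firstprivate) is handled like defaultmap(firstprivate) across
    // all variable categories, and default(shared) has no effect at all.
    void equiv(int s) {
    #pragma omp target default(firstprivate)
      { s++; }
    #pragma omp target defaultmap(firstprivate)
      { s++; }
    }
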
llvm_unreachable("unexpected variable category in OpenMP default clause"); + } + // OpenMP 6.0, page 224, lines 4-5 default Clause, Semantics + // otherwise, its effect on a target construct is equivalent to + // specifying the defaultmap clause with the same data-sharing-attribute + // and variable-category. + // + // If earlier than OpenMP 6.0, or not a target directive, the default DSA + // is/was set as before. + if (IsTargetDefault) { + if (DefMapKind == OMPC_DEFAULTMAP_all) { + DSAStack->setDefaultDMAAttr(DefMapMod, OMPC_DEFAULTMAP_aggregate, MLoc); + DSAStack->setDefaultDMAAttr(DefMapMod, OMPC_DEFAULTMAP_scalar, MLoc); + DSAStack->setDefaultDMAAttr(DefMapMod, OMPC_DEFAULTMAP_pointer, MLoc); + } else { + DSAStack->setDefaultDMAAttr(DefMapMod, DefMapKind, MLoc); + } + } + }; + + SetDefaultClauseAttrs(M, VCKind); return new (getASTContext()) OMPDefaultClause(M, MLoc, VCKind, VCKindLoc, StartLoc, LParenLoc, EndLoc); } diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index 4d58f00168298..a56017cd7b7e7 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -6198,6 +6198,10 @@ void Sema::InstantiateVariableInitializer( currentEvaluationContext().RebuildDefaultArgOrDefaultInit = parentEvaluationContext().RebuildDefaultArgOrDefaultInit; + // Set DeclForInitializer for this variable so DiagIfReachable can properly + // suppress runtime diagnostics for constexpr/static member variables + currentEvaluationContext().DeclForInitializer = Var; + if (OldVar->getInit()) { // Instantiate the initializer. ExprResult Init = @@ -6467,6 +6471,8 @@ void Sema::InstantiateVariableDefinition(SourceLocation PointOfInstantiation, PassToConsumerRAII.Var = Var; Var->setTemplateSpecializationKind(OldVar->getTemplateSpecializationKind(), OldVar->getPointOfInstantiation()); + // Emit any deferred warnings for the variable's initializer + AnalysisWarnings.issueWarningsForRegisteredVarDecl(Var); } // This variable may have local implicit instantiations that need to be diff --git a/clang/test/Analysis/analyzeOneFunction.cpp b/clang/test/Analysis/analyzeOneFunction.cpp index 3a362dfd9a08c..b2257570f6052 100644 --- a/clang/test/Analysis/analyzeOneFunction.cpp +++ b/clang/test/Analysis/analyzeOneFunction.cpp @@ -5,9 +5,9 @@ // RUN: -analyze-function="c:@S@Window@F@overloaded#I#" // RUN: %clang_extdef_map %s | FileCheck %s -// CHECK: 27:c:@S@Window@F@overloaded#I# -// CHECK-NEXT: 27:c:@S@Window@F@overloaded#C# -// CHECK-NEXT: 27:c:@S@Window@F@overloaded#d# +// CHECK-DAG: 27:c:@S@Window@F@overloaded#I# +// CHECK-DAG: 27:c:@S@Window@F@overloaded#C# +// CHECK-DAG: 27:c:@S@Window@F@overloaded#d# void clang_analyzer_warnIfReached(); diff --git a/clang/test/CodeGenSPIRV/spirv-intel.c b/clang/test/CodeGenSPIRV/spirv-intel.c index 997cd6f10b90c..f00fc97adaec7 100644 --- a/clang/test/CodeGenSPIRV/spirv-intel.c +++ b/clang/test/CodeGenSPIRV/spirv-intel.c @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -triple spirv64-intel %s -emit-llvm -o - | FileCheck -check-prefix=CHECK-WITH-64 %s -// RUN: %clang_cc1 -triple spirv32-intel %s -emit-llvm -o - | FileCheck -check-prefix=CHECK-WITH-32 %s +// RUN: %clang_cc1 -triple spirv64-intel %s -emit-llvm -o - | FileCheck -check-prefixes=CHECK-WITH,CHECK-WITH-64 %s +// RUN: %clang_cc1 -triple spirv32-intel %s -emit-llvm -o - | FileCheck -check-prefixes=CHECK-WITH,CHECK-WITH-32 %s // RUN: %clang_cc1 -triple spir-intel %s -emit-llvm -o - | FileCheck -check-prefix=CHECK-WITHOUT %s // RUN: 
%clang_cc1 -triple spir64-intel %s -emit-llvm -o - | FileCheck -check-prefix=CHECK-WITHOUT %s @@ -9,3 +9,11 @@ // CHECK-WITHOUT: spir_func void @foo(ptr noundef %param) #0 { void foo(int *param) { } + +typedef __attribute__((address_space(9))) void * FnPtrTy; + +// CHECK-WITH: %{{.*}} = icmp eq ptr addrspace(9) %{{.*}}, null +int bar() { + FnPtrTy FnPtr = (FnPtrTy)foo; + return FnPtr == 0; +} diff --git a/clang/test/Driver/frame-pointer-elim.c b/clang/test/Driver/frame-pointer-elim.c index 6d719828c6a06..e68fbf529643e 100644 --- a/clang/test/Driver/frame-pointer-elim.c +++ b/clang/test/Driver/frame-pointer-elim.c @@ -2,6 +2,8 @@ // KEEP-ALL: "-mframe-pointer=all" // KEEP-NON-LEAF-NOT: warning: argument unused // KEEP-NON-LEAF: "-mframe-pointer=non-leaf" +// KEEP-NON-LEAF-NO-RESERVE-NOT: warning: argument unused +// KEEP-NON-LEAF-NO-RESERVE: "-mframe-pointer=non-leaf-no-reserve" // KEEP-NONE-NOT: warning: argument unused // KEEP-NONE: "-mframe-pointer=none" // KEEP-RESERVED-NOT: warning: argument unused @@ -24,19 +26,27 @@ // -momit-leaf-frame-pointer omits leaf frame pointer. // -fno-omit-frame-pointer loses out to -momit-leaf-frame-pointer. // RUN: %clang -### --target=i386 -S -momit-leaf-frame-pointer %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // RUN: %clang -### --target=i386-linux -S -O1 -fno-omit-frame-pointer -momit-leaf-frame-pointer %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // RUN: %clang -### --target=i386-linux -S -O1 -momit-leaf-frame-pointer %s 2>&1 | \ // RUN: FileCheck --check-prefix=KEEP-NONE %s +// -momit-leaf-frame-pointer -mreserve-frame-pointer-reg results in the frame pointer reg being reserved +// RUN: %clang -### --target=i386 -S -momit-leaf-frame-pointer -mreserve-frame-pointer-reg %s 2>&1 | \ +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s + +// -fomit-frame-pointer -mreserve-frame-pointer-reg results in the frame pointer reg being reserved +// RUN: %clang -### --target=i386 -S -fomit-frame-pointer -mreserve-frame-pointer-reg %s 2>&1 | \ +// RUN: FileCheck --check-prefix=KEEP-RESERVED %s + // fno-omit-frame-pointer -momit-leaf-frame-pointer can be overwritten by // fomit-frame-pointer later on the command without warning // RUN: %clang -### --target=i386-linux -S -O1 -fno-omit-frame-pointer -momit-leaf-frame-pointer -fomit-frame-pointer %s 2>&1 | \ // RUN: FileCheck --check-prefix=KEEP-NONE %s // RUN: %clang -### --target=i386-linux -S -O1 -fno-omit-frame-pointer -momit-leaf-frame-pointer %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // Explicit or default -fomit-frame-pointer wins over -mno-omit-leaf-frame-pointer. 
// RUN: %clang -### --target=i386 -S %s -fomit-frame-pointer -mno-omit-leaf-frame-pointer 2>&1 | \ // RUN: FileCheck --check-prefix=KEEP-NONE %s @@ -68,45 +78,45 @@ // RUN: FileCheck --check-prefix=KEEP-NONE %s // RUN: %clang -### --target=i386-darwin -S -momit-leaf-frame-pointer %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // RUN: %clang -### -target armv7s-apple-ios -fomit-frame-pointer %s 2>&1 | \ // RUN: FileCheck --check-prefix=WARN-OMIT-7S %s // WARN-OMIT-7S: warning: optimization flag '-fomit-frame-pointer' is not supported for target 'armv7s' -// WARN-OMIT-7S: "-mframe-pointer=non-leaf" +// WARN-OMIT-7S: "-mframe-pointer=non-leaf-no-reserve" // RUN: %clang -### -target armv7k-apple-watchos -fomit-frame-pointer %s 2>&1 | \ // RUN: FileCheck --check-prefix=WARN-OMIT-7K %s // WARN-OMIT-7K: warning: optimization flag '-fomit-frame-pointer' is not supported for target 'armv7k' -// WARN-OMIT-7K: "-mframe-pointer=non-leaf" +// WARN-OMIT-7K: "-mframe-pointer=non-leaf-no-reserve" // RUN: %clang -### -target armv7s-apple-ios8.0 -momit-leaf-frame-pointer %s 2>&1 | \ // RUN: FileCheck --check-prefix=WARN-OMIT-LEAF-7S %s // WARN-OMIT-LEAF-7S-NOT: warning: optimization flag '-momit-leaf-frame-pointer' is not supported for target 'armv7s' -// WARN-OMIT-LEAF-7S: "-mframe-pointer=non-leaf" +// WARN-OMIT-LEAF-7S: "-mframe-pointer=non-leaf-no-reserve" // On AArch64, PS4, PS5, and VE, default to omitting the frame pointer on leaf // functions // RUN: %clang -### --target=aarch64 -S %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // RUN: %clang -### --target=x86_64-scei-ps4 -S %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // RUN: %clang -### --target=x86_64-scei-ps4 -S -O2 %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // RUN: %clang -### --target=x86_64-sie-ps5 -S %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // RUN: %clang -### --target=x86_64-sie-ps5 -S -O2 %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // RUN: %clang -### -target aarch64-apple-darwin -arch arm64_32 -S %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // RUN: %clang -### --target=ve-unknown-linux-gnu -S %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // RUN: %clang -### --target=aarch64-linux-android -S %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // RUN: %clang -### --target=aarch64-linux-android -S -O2 %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // RUN: %clang -### --target=aarch64-linux-android -S -Os %s 2>&1 | \ -// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s +// RUN: FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s // RUN: %clang -### --target=powerpc64 -S %s 2>&1 | \ // RUN: FileCheck --check-prefix=KEEP-ALL %s @@ -161,9 +171,9 @@ // RUN: %clang -### --target=armv7a-linux-androideabi- -mthumb -mbig-endian -O1 -S %s 2>&1 | \ // RUN: FileCheck 
--check-prefix=KEEP-ALL %s
 // RUN: %clang -### --target=riscv64-linux-android -O1 -S %s 2>&1 | \
-// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
+// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
 // RUN: %clang -### --target=riscv64-linux-android -mbig-endian -O1 -S %s 2>&1 | \
-// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
+// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
 
 // On ARM backend bare metal targets, frame pointer is omitted
 // RUN: %clang -### --target=arm-arm-none-eabi -S %s 2>&1 | \
@@ -191,21 +201,21 @@
 
 // Check that for Apple bare metal targets, we're keeping frame pointers by default
 // RUN: %clang -### --target=armv6m-apple-none-macho -S %s 2>&1 | \
-// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
+// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
 // RUN: %clang -### --target=armv6m-apple-none-macho -S -fno-omit-frame-pointer %s 2>&1 | \
-// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
+// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
 // RUN: %clang -### --target=arm-apple-none-macho -S %s 2>&1 | \
-// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
+// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
 // RUN: %clang -### --target=arm-apple-none-macho -S -fno-omit-frame-pointer %s 2>&1 | \
-// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
+// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
 // RUN: %clang -### --target=armv6m-apple-none-macho -S -O1 %s 2>&1 | \
-// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
+// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
 // RUN: %clang -### --target=armv6m-apple-none-macho -S -O1 -fno-omit-frame-pointer %s 2>&1 | \
-// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
+// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
 // RUN: %clang -### --target=arm-apple-none-macho -S -O1 %s 2>&1 | \
-// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
+// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
 // RUN: %clang -### --target=arm-apple-none-macho -S -O1 -fno-omit-frame-pointer %s 2>&1 | \
-// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
+// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
 
 // RUN: %clang --target=armv7-apple-macho -### -S %s 2>&1 \
 // RUN:   -fomit-frame-pointer \
@@ -221,17 +231,21 @@
 
 // AArch64 bare metal targets behave like hosted targets
 // RUN: %clang -### --target=aarch64-none-elf -S %s 2>&1 | \
-// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
+// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
 // RUN: %clang -### --target=aarch64-none-elf -S -O1 %s 2>&1 | \
-// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
+// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
 // RUN: %clang -### --target=aarch64-none-elf -S -fno-omit-frame-pointer %s 2>&1 | \
-// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
+// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
 // RUN: %clang -### --target=aarch64-none-elf -S -O1 -fno-omit-frame-pointer %s 2>&1 | \
-// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF %s
+// RUN:   FileCheck --check-prefix=KEEP-NON-LEAF-NO-RESERVE %s
 
 // AArch64 Windows requires that the frame pointer be reserved
 // RUN: %clang -### --target=aarch64-pc-windows-msvc -S -fomit-frame-pointer %s 2>&1 | \
 // RUN:   FileCheck --check-prefix=KEEP-RESERVED %s
 
+// -mno-reserve-frame-pointer-reg overrides the target platform default
+// RUN: %clang -### --target=aarch64-pc-windows-msvc -S -fomit-frame-pointer -mno-reserve-frame-pointer-reg %s 2>&1 | \
+// RUN:   FileCheck --check-prefix=KEEP-NONE %s
+
 void f0() {}
 void f1() { f0(); }
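As far as these updated expectations show, the flag mapping is: -momit-leaf-frame-pointer alone now selects "-mframe-pointer=non-leaf-no-reserve"; adding -mreserve-frame-pointer-reg upgrades it to "-mframe-pointer=non-leaf"; -fomit-frame-pointer with -mreserve-frame-pointer-reg yields "-mframe-pointer=reserved"; and -mno-reserve-frame-pointer-reg forces "-mframe-pointer=none" even where the platform (e.g. AArch64 Windows) reserves the register by default. A small sketch of what the leaf/non-leaf distinction means for generated code (hypothetical example, assuming the usual reading of these modes; 'leaf' and 'nonleaf' are illustrative names):

    // non-leaf-no-reserve: 'leaf' makes no calls, so it may omit the frame
    // pointer and leave the FP register free for allocation; 'nonleaf' still
    // sets up a frame pointer because it calls another function.
    int leaf(int x) { return x + 1; }
    int nonleaf(int x) { return leaf(x) * 2; }
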
diff --git a/clang/test/Driver/fuchsia.c b/clang/test/Driver/fuchsia.c
index 99e5018117924..0cf7535d14bd5 100644
--- a/clang/test/Driver/fuchsia.c
+++ b/clang/test/Driver/fuchsia.c
@@ -77,7 +77,7 @@
 // RUN: %clang -### %s --target=aarch64-unknown-fuchsia -O3 2>&1 \
 // RUN:     | FileCheck %s -check-prefix=CHECK-FP-NONE
 // CHECK-FP-ALL: "-mframe-pointer=all"
-// CHECK-FP-NONLEAF: "-mframe-pointer=non-leaf"
+// CHECK-FP-NONLEAF: "-mframe-pointer=non-leaf-no-reserve"
 // CHECK-FP-NONE: "-mframe-pointer=none"
 
 // RUN: not %clang -### %s --target=x86_64-unknown-fuchsia -rtlib=libgcc 2>&1 \
diff --git a/clang/test/OpenMP/target_default_codegen.cpp b/clang/test/OpenMP/target_default_codegen.cpp
new file mode 100644
index 0000000000000..eadd0e57945b1
--- /dev/null
+++ b/clang/test/OpenMP/target_default_codegen.cpp
@@ -0,0 +1,2020 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --version 5
+// expected-no-diagnostics
+
+// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -Wno-vla -fopenmp -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK-64
+// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK-64
+// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -Wno-vla -fopenmp -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK-32
+// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK-32
+
+// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -Wno-vla -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY-64 %s
+// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY-64 %s
+// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -Wno-vla
-fopenmp-simd -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY-32 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=60 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify -Wno-vla %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY-32 %s + +#ifndef HEADER +#define HEADER +void foo1(int a) { + double d = (double)a; + + #pragma omp target default(private: scalar) + { + d += 1.0; + } +} + +void foo2() { + int pvtArr[10]; + + #pragma omp target default(private: aggregate) + { + pvtArr[5]++; + } +} + +void foo3() { + int *pa; + + #pragma omp target default(private: pointer) + { + pa[50]++; + } +} + +// Specified variable-category doesn't apply to referenced variable, so +// normal implicitly determined data-sharing applies. +void foo4() { + int p; + + #pragma omp target default(private: pointer) + { + p++; + } +} + +// Verify default clause with variable-category 'all' is equivalent to no +// variable-category. IR checks generated with 'all' but test runs without +// variable-category. +void foo5(int a) { + double d = (double)a; + int pvtArr[10]; + int *pa; + + #pragma omp target default(private) + { + d += 1.0; + pvtArr[5]++; + pa[50]++; + } +} + +// Verify default clause with 'shared' DSA is ignored. This makes it +// equivalent to target with no default clause. IR checks generated with +// no default clause but test runs with default 'shared'. +void foo6(int a) { + double d = (double)a; + int pvtArr[10]; + int *pa; + + #pragma omp target default(shared) + { + d += 1.0; + pvtArr[5]++; + pa[50]++; + } +} + +// Verify default clause with 'firstprivate' DSA is equivalent to specifying +// defaultmap with 'firstprivate'. IR checks generated with +// defaultmap(firstprivate) but test runs with default(firstprivate). +void foo7(int a) { + double d = (double)a; + int pvtArr[10]; + int *pa; + + #pragma omp target default(firstprivate) + { + d += 1.0; + pvtArr[5]++; + pa[50]++; + } +} + +// Verify 'default' clause on a combined 'target' directive is equivalent to +// specifying its constituent directives with 'default' clauses. IR checks +// generated with constituent directives but test runs with combined +// directive. 
+void foo8() { + int x = 0; + #pragma omp target teams distribute parallel for default(firstprivate) firstprivate(x) + for (int i=0; i<10; i++) + x += 1; +} +#endif // HEADER +// CK-64-LABEL: define dso_local void @_Z4foo1i( +// CK-64-SAME: i32 signext [[A:%.*]]) #[[ATTR0:[0-9]+]] { +// CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[D:%.*]] = alloca double, align 8 +// CK-64-NEXT: [[D_CASTED:%.*]] = alloca i64, align 8 +// CK-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 +// CK-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 +// CK-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 +// CK-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK-64-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// CK-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CK-64-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// CK-64-NEXT: store double [[CONV]], ptr [[D]], align 8 +// CK-64-NEXT: [[TMP1:%.*]] = load double, ptr [[D]], align 8 +// CK-64-NEXT: store double [[TMP1]], ptr [[D_CASTED]], align 8 +// CK-64-NEXT: [[TMP2:%.*]] = load i64, ptr [[D_CASTED]], align 8 +// CK-64-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-64-NEXT: store i64 [[TMP2]], ptr [[TMP3]], align 8 +// CK-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-64-NEXT: store i64 [[TMP2]], ptr [[TMP4]], align 8 +// CK-64-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CK-64-NEXT: store ptr null, ptr [[TMP5]], align 8 +// CK-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-64-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-64-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK-64-NEXT: store i32 3, ptr [[TMP8]], align 4 +// CK-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK-64-NEXT: store i32 1, ptr [[TMP9]], align 4 +// CK-64-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK-64-NEXT: store ptr [[TMP6]], ptr [[TMP10]], align 8 +// CK-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK-64-NEXT: store ptr [[TMP7]], ptr [[TMP11]], align 8 +// CK-64-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK-64-NEXT: store ptr @.offload_sizes, ptr [[TMP12]], align 8 +// CK-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK-64-NEXT: store ptr @.offload_maptypes, ptr [[TMP13]], align 8 +// CK-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK-64-NEXT: store ptr null, ptr [[TMP14]], align 8 +// CK-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK-64-NEXT: store ptr null, ptr [[TMP15]], align 8 +// CK-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK-64-NEXT: store i64 0, 
ptr [[TMP16]], align 8 +// CK-64-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK-64-NEXT: store i64 0, ptr [[TMP17]], align 8 +// CK-64-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP18]], align 4 +// CK-64-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK-64-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP19]], align 4 +// CK-64-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK-64-NEXT: store i32 0, ptr [[TMP20]], align 4 +// CK-64-NEXT: [[TMP21:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1i_l23.region_id, ptr [[KERNEL_ARGS]]) +// CK-64-NEXT: [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0 +// CK-64-NEXT: br i1 [[TMP22]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK-64: [[OMP_OFFLOAD_FAILED]]: +// CK-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1i_l23(i64 [[TMP2]]) #[[ATTR2:[0-9]+]] +// CK-64-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK-64: [[OMP_OFFLOAD_CONT]]: +// CK-64-NEXT: ret void +// +// +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1i_l23( +// CK-64-SAME: i64 [[D:%.*]]) #[[ATTR1:[0-9]+]] { +// CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[D_ADDR:%.*]] = alloca i64, align 8 +// CK-64-NEXT: [[D1:%.*]] = alloca double, align 8 +// CK-64-NEXT: store i64 [[D]], ptr [[D_ADDR]], align 8 +// CK-64-NEXT: [[TMP0:%.*]] = load double, ptr [[D1]], align 8 +// CK-64-NEXT: [[ADD:%.*]] = fadd double [[TMP0]], 1.000000e+00 +// CK-64-NEXT: store double [[ADD]], ptr [[D1]], align 8 +// CK-64-NEXT: ret void +// +// +// CK-64-LABEL: define dso_local void @_Z4foo2v( +// CK-64-SAME: ) #[[ATTR0]] { +// CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], align 4 +// CK-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 +// CK-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 +// CK-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 +// CK-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK-64-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-64-NEXT: store ptr [[PVTARR]], ptr [[TMP0]], align 8 +// CK-64-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-64-NEXT: store ptr [[PVTARR]], ptr [[TMP1]], align 8 +// CK-64-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CK-64-NEXT: store ptr null, ptr [[TMP2]], align 8 +// CK-64-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-64-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK-64-NEXT: store i32 3, ptr [[TMP5]], align 4 +// CK-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK-64-NEXT: store i32 1, ptr [[TMP6]], align 4 +// CK-64-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK-64-NEXT: store ptr [[TMP3]], ptr [[TMP7]], align 8 +// CK-64-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK-64-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 8 +// CK-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK-64-NEXT: store ptr @.offload_sizes.1, ptr [[TMP9]], align 8 +// CK-64-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK-64-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP10]], align 8 +// CK-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK-64-NEXT: store ptr null, ptr [[TMP11]], align 8 +// CK-64-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK-64-NEXT: store ptr null, ptr [[TMP12]], align 8 +// CK-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK-64-NEXT: store i64 0, ptr [[TMP13]], align 8 +// CK-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK-64-NEXT: store i64 0, ptr [[TMP14]], align 8 +// CK-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP15]], align 4 +// CK-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK-64-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4 +// CK-64-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK-64-NEXT: store i32 0, ptr [[TMP17]], align 4 +// CK-64-NEXT: [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l32.region_id, ptr [[KERNEL_ARGS]]) +// CK-64-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 +// CK-64-NEXT: br i1 [[TMP19]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK-64: [[OMP_OFFLOAD_FAILED]]: +// CK-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l32(ptr [[PVTARR]]) #[[ATTR2]] +// CK-64-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK-64: [[OMP_OFFLOAD_CONT]]: +// CK-64-NEXT: ret void +// +// +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l32( +// CK-64-SAME: ptr nonnull align 4 dereferenceable(40) [[PVTARR:%.*]]) #[[ATTR1]] { +// CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[PVTARR_ADDR:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: [[PVTARR1:%.*]] = alloca [10 x i32], align 4 +// CK-64-NEXT: store ptr [[PVTARR]], ptr [[PVTARR_ADDR]], align 8 +// CK-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PVTARR_ADDR]], align 8, !nonnull [[META19:![0-9]+]], !align [[META20:![0-9]+]] +// CK-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR1]], i64 0, i64 5 +// CK-64-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CK-64-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CK-64-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// CK-64-NEXT: ret void +// +// +// CK-64-LABEL: define dso_local void @_Z4foo3v( +// CK-64-SAME: ) #[[ATTR0]] { +// 
CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[PA:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 +// CK-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 +// CK-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 +// CK-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PA]], align 8 +// CK-64-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-64-NEXT: store ptr [[TMP0]], ptr [[TMP1]], align 8 +// CK-64-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-64-NEXT: store ptr [[TMP0]], ptr [[TMP2]], align 8 +// CK-64-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CK-64-NEXT: store ptr null, ptr [[TMP3]], align 8 +// CK-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-64-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK-64-NEXT: store i32 3, ptr [[TMP6]], align 4 +// CK-64-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK-64-NEXT: store i32 1, ptr [[TMP7]], align 4 +// CK-64-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK-64-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 8 +// CK-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK-64-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 8 +// CK-64-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK-64-NEXT: store ptr @.offload_sizes.3, ptr [[TMP10]], align 8 +// CK-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK-64-NEXT: store ptr @.offload_maptypes.4, ptr [[TMP11]], align 8 +// CK-64-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK-64-NEXT: store ptr null, ptr [[TMP12]], align 8 +// CK-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK-64-NEXT: store ptr null, ptr [[TMP13]], align 8 +// CK-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK-64-NEXT: store i64 0, ptr [[TMP14]], align 8 +// CK-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK-64-NEXT: store i64 0, ptr [[TMP15]], align 8 +// CK-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP16]], align 4 +// CK-64-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK-64-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4 +// CK-64-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK-64-NEXT: store i32 0, ptr 
[[TMP18]], align 4 +// CK-64-NEXT: [[TMP19:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l41.region_id, ptr [[KERNEL_ARGS]]) +// CK-64-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CK-64-NEXT: br i1 [[TMP20]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK-64: [[OMP_OFFLOAD_FAILED]]: +// CK-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l41(ptr [[TMP0]]) #[[ATTR2]] +// CK-64-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK-64: [[OMP_OFFLOAD_CONT]]: +// CK-64-NEXT: ret void +// +// +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l41( +// CK-64-SAME: ptr [[PA:%.*]]) #[[ATTR1]] { +// CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[PA_ADDR:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: [[PA1:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: store ptr [[PA]], ptr [[PA_ADDR]], align 8 +// CK-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PA1]], align 8 +// CK-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 50 +// CK-64-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CK-64-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CK-64-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// CK-64-NEXT: ret void +// +// +// CK-64-LABEL: define dso_local void @_Z4foo4v( +// CK-64-SAME: ) #[[ATTR0]] { +// CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[P:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[P_CASTED:%.*]] = alloca i64, align 8 +// CK-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 +// CK-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 +// CK-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 +// CK-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[P]], align 4 +// CK-64-NEXT: store i32 [[TMP0]], ptr [[P_CASTED]], align 4 +// CK-64-NEXT: [[TMP1:%.*]] = load i64, ptr [[P_CASTED]], align 8 +// CK-64-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-64-NEXT: store i64 [[TMP1]], ptr [[TMP2]], align 8 +// CK-64-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-64-NEXT: store i64 [[TMP1]], ptr [[TMP3]], align 8 +// CK-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CK-64-NEXT: store ptr null, ptr [[TMP4]], align 8 +// CK-64-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-64-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK-64-NEXT: store i32 3, ptr [[TMP7]], align 4 +// CK-64-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK-64-NEXT: store i32 1, ptr [[TMP8]], align 4 +// CK-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK-64-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 8 +// CK-64-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK-64-NEXT: store ptr [[TMP6]], ptr [[TMP10]], align 8 +// CK-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr 
[[KERNEL_ARGS]], i32 0, i32 4 +// CK-64-NEXT: store ptr @.offload_sizes.5, ptr [[TMP11]], align 8 +// CK-64-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK-64-NEXT: store ptr @.offload_maptypes.6, ptr [[TMP12]], align 8 +// CK-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK-64-NEXT: store ptr null, ptr [[TMP13]], align 8 +// CK-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK-64-NEXT: store ptr null, ptr [[TMP14]], align 8 +// CK-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK-64-NEXT: store i64 0, ptr [[TMP15]], align 8 +// CK-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK-64-NEXT: store i64 0, ptr [[TMP16]], align 8 +// CK-64-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP17]], align 4 +// CK-64-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK-64-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP18]], align 4 +// CK-64-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK-64-NEXT: store i32 0, ptr [[TMP19]], align 4 +// CK-64-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l52.region_id, ptr [[KERNEL_ARGS]]) +// CK-64-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 +// CK-64-NEXT: br i1 [[TMP21]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK-64: [[OMP_OFFLOAD_FAILED]]: +// CK-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l52(i64 [[TMP1]]) #[[ATTR2]] +// CK-64-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK-64: [[OMP_OFFLOAD_CONT]]: +// CK-64-NEXT: ret void +// +// +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l52( +// CK-64-SAME: i64 [[P:%.*]]) #[[ATTR1]] { +// CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[P_ADDR:%.*]] = alloca i64, align 8 +// CK-64-NEXT: store i64 [[P]], ptr [[P_ADDR]], align 8 +// CK-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[P_ADDR]], align 4 +// CK-64-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 +// CK-64-NEXT: store i32 [[INC]], ptr [[P_ADDR]], align 4 +// CK-64-NEXT: ret void +// +// +// CK-64-LABEL: define dso_local void @_Z4foo5i( +// CK-64-SAME: i32 signext [[A:%.*]]) #[[ATTR0]] { +// CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[D:%.*]] = alloca double, align 8 +// CK-64-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], align 4 +// CK-64-NEXT: [[PA:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: [[D_CASTED:%.*]] = alloca i64, align 8 +// CK-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 8 +// CK-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 8 +// CK-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 8 +// CK-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK-64-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// CK-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CK-64-NEXT: 
[[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// CK-64-NEXT: store double [[CONV]], ptr [[D]], align 8 +// CK-64-NEXT: [[TMP1:%.*]] = load double, ptr [[D]], align 8 +// CK-64-NEXT: store double [[TMP1]], ptr [[D_CASTED]], align 8 +// CK-64-NEXT: [[TMP2:%.*]] = load i64, ptr [[D_CASTED]], align 8 +// CK-64-NEXT: [[TMP3:%.*]] = load ptr, ptr [[PA]], align 8 +// CK-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-64-NEXT: store i64 [[TMP2]], ptr [[TMP4]], align 8 +// CK-64-NEXT: [[TMP5:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-64-NEXT: store i64 [[TMP2]], ptr [[TMP5]], align 8 +// CK-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CK-64-NEXT: store ptr null, ptr [[TMP6]], align 8 +// CK-64-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CK-64-NEXT: store ptr [[PVTARR]], ptr [[TMP7]], align 8 +// CK-64-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CK-64-NEXT: store ptr [[PVTARR]], ptr [[TMP8]], align 8 +// CK-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// CK-64-NEXT: store ptr null, ptr [[TMP9]], align 8 +// CK-64-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CK-64-NEXT: store ptr [[TMP3]], ptr [[TMP10]], align 8 +// CK-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CK-64-NEXT: store ptr [[TMP3]], ptr [[TMP11]], align 8 +// CK-64-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2 +// CK-64-NEXT: store ptr null, ptr [[TMP12]], align 8 +// CK-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK-64-NEXT: store i32 3, ptr [[TMP15]], align 4 +// CK-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK-64-NEXT: store i32 3, ptr [[TMP16]], align 4 +// CK-64-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK-64-NEXT: store ptr [[TMP13]], ptr [[TMP17]], align 8 +// CK-64-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK-64-NEXT: store ptr [[TMP14]], ptr [[TMP18]], align 8 +// CK-64-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK-64-NEXT: store ptr @.offload_sizes.7, ptr [[TMP19]], align 8 +// CK-64-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK-64-NEXT: store ptr @.offload_maptypes.8, ptr [[TMP20]], align 8 +// CK-64-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK-64-NEXT: store ptr null, ptr [[TMP21]], align 8 +// CK-64-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK-64-NEXT: store ptr null, ptr [[TMP22]], align 8 +// CK-64-NEXT: 
[[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK-64-NEXT: store i64 0, ptr [[TMP23]], align 8 +// CK-64-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK-64-NEXT: store i64 0, ptr [[TMP24]], align 8 +// CK-64-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP25]], align 4 +// CK-64-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK-64-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP26]], align 4 +// CK-64-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK-64-NEXT: store i32 0, ptr [[TMP27]], align 4 +// CK-64-NEXT: [[TMP28:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo5i_l66.region_id, ptr [[KERNEL_ARGS]]) +// CK-64-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CK-64-NEXT: br i1 [[TMP29]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK-64: [[OMP_OFFLOAD_FAILED]]: +// CK-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo5i_l66(i64 [[TMP2]], ptr [[PVTARR]], ptr [[TMP3]]) #[[ATTR2]] +// CK-64-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK-64: [[OMP_OFFLOAD_CONT]]: +// CK-64-NEXT: ret void +// +// +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo5i_l66( +// CK-64-SAME: i64 [[D:%.*]], ptr nonnull align 4 dereferenceable(40) [[PVTARR:%.*]], ptr [[PA:%.*]]) #[[ATTR1]] { +// CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[D_ADDR:%.*]] = alloca i64, align 8 +// CK-64-NEXT: [[PVTARR_ADDR:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: [[PA_ADDR:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: [[D1:%.*]] = alloca double, align 8 +// CK-64-NEXT: [[PVTARR2:%.*]] = alloca [10 x i32], align 4 +// CK-64-NEXT: [[PA3:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: store i64 [[D]], ptr [[D_ADDR]], align 8 +// CK-64-NEXT: store ptr [[PVTARR]], ptr [[PVTARR_ADDR]], align 8 +// CK-64-NEXT: store ptr [[PA]], ptr [[PA_ADDR]], align 8 +// CK-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PVTARR_ADDR]], align 8, !nonnull [[META19]], !align [[META20]] +// CK-64-NEXT: [[TMP1:%.*]] = load double, ptr [[D1]], align 8 +// CK-64-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00 +// CK-64-NEXT: store double [[ADD]], ptr [[D1]], align 8 +// CK-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR2]], i64 0, i64 5 +// CK-64-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CK-64-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// CK-64-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// CK-64-NEXT: [[TMP3:%.*]] = load ptr, ptr [[PA3]], align 8 +// CK-64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 50 +// CK-64-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4 +// CK-64-NEXT: [[INC5:%.*]] = add nsw i32 [[TMP4]], 1 +// CK-64-NEXT: store i32 [[INC5]], ptr [[ARRAYIDX4]], align 4 +// CK-64-NEXT: ret void +// +// +// CK-64-LABEL: define dso_local void @_Z4foo6i( +// CK-64-SAME: i32 signext [[A:%.*]]) #[[ATTR0]] { +// CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[D:%.*]] = alloca double, align 8 +// CK-64-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], 
align 4 +// CK-64-NEXT: [[PA:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: [[D_CASTED:%.*]] = alloca i64, align 8 +// CK-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 8 +// CK-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 8 +// CK-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 8 +// CK-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK-64-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// CK-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CK-64-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// CK-64-NEXT: store double [[CONV]], ptr [[D]], align 8 +// CK-64-NEXT: [[TMP1:%.*]] = load double, ptr [[D]], align 8 +// CK-64-NEXT: store double [[TMP1]], ptr [[D_CASTED]], align 8 +// CK-64-NEXT: [[TMP2:%.*]] = load i64, ptr [[D_CASTED]], align 8 +// CK-64-NEXT: [[TMP3:%.*]] = load ptr, ptr [[PA]], align 8 +// CK-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-64-NEXT: store i64 [[TMP2]], ptr [[TMP4]], align 8 +// CK-64-NEXT: [[TMP5:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-64-NEXT: store i64 [[TMP2]], ptr [[TMP5]], align 8 +// CK-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CK-64-NEXT: store ptr null, ptr [[TMP6]], align 8 +// CK-64-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CK-64-NEXT: store ptr [[PVTARR]], ptr [[TMP7]], align 8 +// CK-64-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CK-64-NEXT: store ptr [[PVTARR]], ptr [[TMP8]], align 8 +// CK-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// CK-64-NEXT: store ptr null, ptr [[TMP9]], align 8 +// CK-64-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CK-64-NEXT: store ptr [[TMP3]], ptr [[TMP10]], align 8 +// CK-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CK-64-NEXT: store ptr [[TMP3]], ptr [[TMP11]], align 8 +// CK-64-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2 +// CK-64-NEXT: store ptr null, ptr [[TMP12]], align 8 +// CK-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK-64-NEXT: store i32 3, ptr [[TMP15]], align 4 +// CK-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK-64-NEXT: store i32 3, ptr [[TMP16]], align 4 +// CK-64-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK-64-NEXT: store ptr [[TMP13]], ptr [[TMP17]], align 8 +// CK-64-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK-64-NEXT: store ptr [[TMP14]], ptr [[TMP18]], align 8 +// CK-64-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK-64-NEXT: store ptr @.offload_sizes.9, ptr [[TMP19]], align 8 +// CK-64-NEXT: [[TMP20:%.*]] 
= getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK-64-NEXT: store ptr @.offload_maptypes.10, ptr [[TMP20]], align 8 +// CK-64-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK-64-NEXT: store ptr null, ptr [[TMP21]], align 8 +// CK-64-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK-64-NEXT: store ptr null, ptr [[TMP22]], align 8 +// CK-64-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK-64-NEXT: store i64 0, ptr [[TMP23]], align 8 +// CK-64-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK-64-NEXT: store i64 0, ptr [[TMP24]], align 8 +// CK-64-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP25]], align 4 +// CK-64-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK-64-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP26]], align 4 +// CK-64-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK-64-NEXT: store i32 0, ptr [[TMP27]], align 4 +// CK-64-NEXT: [[TMP28:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo6i_l82.region_id, ptr [[KERNEL_ARGS]]) +// CK-64-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CK-64-NEXT: br i1 [[TMP29]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK-64: [[OMP_OFFLOAD_FAILED]]: +// CK-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo6i_l82(i64 [[TMP2]], ptr [[PVTARR]], ptr [[TMP3]]) #[[ATTR2]] +// CK-64-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK-64: [[OMP_OFFLOAD_CONT]]: +// CK-64-NEXT: ret void +// +// +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo6i_l82( +// CK-64-SAME: i64 [[D:%.*]], ptr nonnull align 4 dereferenceable(40) [[PVTARR:%.*]], ptr [[PA:%.*]]) #[[ATTR1]] { +// CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[D_ADDR:%.*]] = alloca i64, align 8 +// CK-64-NEXT: [[PVTARR_ADDR:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: [[PA_ADDR:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: store i64 [[D]], ptr [[D_ADDR]], align 8 +// CK-64-NEXT: store ptr [[PVTARR]], ptr [[PVTARR_ADDR]], align 8 +// CK-64-NEXT: store ptr [[PA]], ptr [[PA_ADDR]], align 8 +// CK-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PVTARR_ADDR]], align 8, !nonnull [[META19]], !align [[META20]] +// CK-64-NEXT: [[TMP1:%.*]] = load double, ptr [[D_ADDR]], align 8 +// CK-64-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00 +// CK-64-NEXT: store double [[ADD]], ptr [[D_ADDR]], align 8 +// CK-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i64 0, i64 5 +// CK-64-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CK-64-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// CK-64-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// CK-64-NEXT: [[TMP3:%.*]] = load ptr, ptr [[PA_ADDR]], align 8 +// CK-64-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 50 +// CK-64-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +// CK-64-NEXT: [[INC2:%.*]] = add nsw i32 
[[TMP4]], 1 +// CK-64-NEXT: store i32 [[INC2]], ptr [[ARRAYIDX1]], align 4 +// CK-64-NEXT: ret void +// +// +// CK-64-LABEL: define dso_local void @_Z4foo7i( +// CK-64-SAME: i32 signext [[A:%.*]]) #[[ATTR0]] { +// CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[D:%.*]] = alloca double, align 8 +// CK-64-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], align 4 +// CK-64-NEXT: [[PA:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: [[D_CASTED:%.*]] = alloca i64, align 8 +// CK-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 8 +// CK-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 8 +// CK-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 8 +// CK-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK-64-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// CK-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CK-64-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// CK-64-NEXT: store double [[CONV]], ptr [[D]], align 8 +// CK-64-NEXT: [[TMP1:%.*]] = load double, ptr [[D]], align 8 +// CK-64-NEXT: store double [[TMP1]], ptr [[D_CASTED]], align 8 +// CK-64-NEXT: [[TMP2:%.*]] = load i64, ptr [[D_CASTED]], align 8 +// CK-64-NEXT: [[TMP3:%.*]] = load ptr, ptr [[PA]], align 8 +// CK-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-64-NEXT: store i64 [[TMP2]], ptr [[TMP4]], align 8 +// CK-64-NEXT: [[TMP5:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-64-NEXT: store i64 [[TMP2]], ptr [[TMP5]], align 8 +// CK-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CK-64-NEXT: store ptr null, ptr [[TMP6]], align 8 +// CK-64-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CK-64-NEXT: store ptr [[PVTARR]], ptr [[TMP7]], align 8 +// CK-64-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CK-64-NEXT: store ptr [[PVTARR]], ptr [[TMP8]], align 8 +// CK-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// CK-64-NEXT: store ptr null, ptr [[TMP9]], align 8 +// CK-64-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CK-64-NEXT: store ptr [[TMP3]], ptr [[TMP10]], align 8 +// CK-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CK-64-NEXT: store ptr [[TMP3]], ptr [[TMP11]], align 8 +// CK-64-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2 +// CK-64-NEXT: store ptr null, ptr [[TMP12]], align 8 +// CK-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK-64-NEXT: store i32 3, ptr [[TMP15]], align 4 +// CK-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK-64-NEXT: store i32 3, ptr [[TMP16]], align 4 +// CK-64-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK-64-NEXT: store ptr [[TMP13]], ptr [[TMP17]], align 8 +// CK-64-NEXT: 
[[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK-64-NEXT: store ptr [[TMP14]], ptr [[TMP18]], align 8 +// CK-64-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK-64-NEXT: store ptr @.offload_sizes.11, ptr [[TMP19]], align 8 +// CK-64-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK-64-NEXT: store ptr @.offload_maptypes.12, ptr [[TMP20]], align 8 +// CK-64-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK-64-NEXT: store ptr null, ptr [[TMP21]], align 8 +// CK-64-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK-64-NEXT: store ptr null, ptr [[TMP22]], align 8 +// CK-64-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK-64-NEXT: store i64 0, ptr [[TMP23]], align 8 +// CK-64-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK-64-NEXT: store i64 0, ptr [[TMP24]], align 8 +// CK-64-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK-64-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP25]], align 4 +// CK-64-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK-64-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP26]], align 4 +// CK-64-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK-64-NEXT: store i32 0, ptr [[TMP27]], align 4 +// CK-64-NEXT: [[TMP28:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo7i_l98.region_id, ptr [[KERNEL_ARGS]]) +// CK-64-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CK-64-NEXT: br i1 [[TMP29]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK-64: [[OMP_OFFLOAD_FAILED]]: +// CK-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo7i_l98(i64 [[TMP2]], ptr [[PVTARR]], ptr [[TMP3]]) #[[ATTR2]] +// CK-64-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK-64: [[OMP_OFFLOAD_CONT]]: +// CK-64-NEXT: ret void +// +// +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo7i_l98( +// CK-64-SAME: i64 [[D:%.*]], ptr nonnull align 4 dereferenceable(40) [[PVTARR:%.*]], ptr [[PA:%.*]]) #[[ATTR1]] { +// CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[D_ADDR:%.*]] = alloca i64, align 8 +// CK-64-NEXT: [[PVTARR_ADDR:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: [[PA_ADDR:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: [[PVTARR1:%.*]] = alloca [10 x i32], align 4 +// CK-64-NEXT: store i64 [[D]], ptr [[D_ADDR]], align 8 +// CK-64-NEXT: store ptr [[PVTARR]], ptr [[PVTARR_ADDR]], align 8 +// CK-64-NEXT: store ptr [[PA]], ptr [[PA_ADDR]], align 8 +// CK-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PVTARR_ADDR]], align 8, !nonnull [[META19]], !align [[META20]] +// CK-64-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[PVTARR1]], ptr align 4 [[TMP0]], i64 40, i1 false) +// CK-64-NEXT: [[TMP1:%.*]] = load double, ptr [[D_ADDR]], align 8 +// CK-64-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00 +// CK-64-NEXT: store double 
[[ADD]], ptr [[D_ADDR]], align 8 +// CK-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR1]], i64 0, i64 5 +// CK-64-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CK-64-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// CK-64-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// CK-64-NEXT: [[TMP3:%.*]] = load ptr, ptr [[PA_ADDR]], align 8 +// CK-64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 50 +// CK-64-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +// CK-64-NEXT: [[INC3:%.*]] = add nsw i32 [[TMP4]], 1 +// CK-64-NEXT: store i32 [[INC3]], ptr [[ARRAYIDX2]], align 4 +// CK-64-NEXT: ret void +// +// +// CK-64-LABEL: define dso_local void @_Z4foo8v( +// CK-64-SAME: ) #[[ATTR0]] { +// CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[X:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[X_CASTED:%.*]] = alloca i64, align 8 +// CK-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 +// CK-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 +// CK-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 +// CK-64-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK-64-NEXT: store i32 0, ptr [[X]], align 4 +// CK-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4 +// CK-64-NEXT: store i32 [[TMP0]], ptr [[X_CASTED]], align 4 +// CK-64-NEXT: [[TMP1:%.*]] = load i64, ptr [[X_CASTED]], align 8 +// CK-64-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-64-NEXT: store i64 [[TMP1]], ptr [[TMP2]], align 8 +// CK-64-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-64-NEXT: store i64 [[TMP1]], ptr [[TMP3]], align 8 +// CK-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CK-64-NEXT: store ptr null, ptr [[TMP4]], align 8 +// CK-64-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-64-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK-64-NEXT: store i32 3, ptr [[TMP7]], align 4 +// CK-64-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK-64-NEXT: store i32 1, ptr [[TMP8]], align 4 +// CK-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK-64-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 8 +// CK-64-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK-64-NEXT: store ptr [[TMP6]], ptr [[TMP10]], align 8 +// CK-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK-64-NEXT: store ptr @.offload_sizes.13, ptr [[TMP11]], align 8 +// CK-64-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK-64-NEXT: store ptr @.offload_maptypes.14, ptr [[TMP12]], align 8 +// CK-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK-64-NEXT: store ptr null, ptr [[TMP13]], align 8 +// CK-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK-64-NEXT: store ptr null, ptr [[TMP14]], align 8 +// CK-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK-64-NEXT: store i64 10, ptr [[TMP15]], align 8 +// CK-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK-64-NEXT: store i64 0, ptr [[TMP16]], align 8 +// CK-64-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK-64-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4 +// CK-64-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK-64-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP18]], align 4 +// CK-64-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK-64-NEXT: store i32 0, ptr [[TMP19]], align 4 +// CK-64-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.region_id, ptr [[KERNEL_ARGS]]) +// CK-64-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 +// CK-64-NEXT: br i1 [[TMP21]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK-64: [[OMP_OFFLOAD_FAILED]]: +// CK-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112(i64 [[TMP1]]) #[[ATTR2]] +// CK-64-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK-64: [[OMP_OFFLOAD_CONT]]: +// CK-64-NEXT: ret void +// +// +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112( +// CK-64-SAME: i64 [[X:%.*]]) #[[ATTR1]] { +// CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[X_ADDR:%.*]] = alloca i64, align 8 +// CK-64-NEXT: [[X_CASTED:%.*]] = alloca i64, align 8 +// CK-64-NEXT: store i64 [[X]], ptr [[X_ADDR]], align 8 +// CK-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[X_ADDR]], align 4 +// CK-64-NEXT: store i32 [[TMP0]], ptr [[X_CASTED]], align 4 +// CK-64-NEXT: [[TMP1:%.*]] = load i64, ptr [[X_CASTED]], align 8 +// CK-64-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.omp_outlined, i64 [[TMP1]]) +// CK-64-NEXT: ret void +// +// +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.omp_outlined( +// CK-64-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[X:%.*]]) #[[ATTR1]] { +// CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: [[X_ADDR:%.*]] = alloca i64, align 8 +// CK-64-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[I:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[X_CASTED:%.*]] = alloca i64, align 8 +// CK-64-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CK-64-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// CK-64-NEXT: store i64 [[X]], ptr [[X_ADDR]], align 8 +// CK-64-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 +// CK-64-NEXT: store i32 9, ptr [[DOTOMP_COMB_UB]], align 4 +// CK-64-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CK-64-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CK-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CK-64-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4 +// CK-64-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CK-64-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 +// CK-64-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9 +// CK-64-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] +// CK-64: [[COND_TRUE]]: +// CK-64-NEXT: br label %[[COND_END:.*]] +// CK-64: [[COND_FALSE]]: +// CK-64-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 +// CK-64-NEXT: br label %[[COND_END]] +// CK-64: [[COND_END]]: +// CK-64-NEXT: [[COND:%.*]] = phi i32 [ 9, %[[COND_TRUE]] ], [ [[TMP3]], %[[COND_FALSE]] ] +// CK-64-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4 +// CK-64-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4 +// CK-64-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4 +// CK-64-NEXT: br label %[[OMP_INNER_FOR_COND:.*]] +// CK-64: [[OMP_INNER_FOR_COND]]: +// CK-64-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CK-64-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 +// CK-64-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]] +// CK-64-NEXT: br i1 [[CMP1]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_END:.*]] +// CK-64: [[OMP_INNER_FOR_BODY]]: +// CK-64-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4 +// CK-64-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CK-64-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 +// CK-64-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +// CK-64-NEXT: [[TMP11:%.*]] = load i32, ptr [[X_ADDR]], align 4 +// CK-64-NEXT: store i32 [[TMP11]], ptr [[X_CASTED]], align 4 +// CK-64-NEXT: [[TMP12:%.*]] = load i64, ptr [[X_CASTED]], align 8 +// CK-64-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_call(ptr @[[GLOB1]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]], i64 [[TMP12]]) +// CK-64-NEXT: br label %[[OMP_INNER_FOR_INC:.*]] +// CK-64: [[OMP_INNER_FOR_INC]]: +// CK-64-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CK-64-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4 +// CK-64-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]] +// CK-64-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4 +// CK-64-NEXT: br label %[[OMP_INNER_FOR_COND]] +// CK-64: [[OMP_INNER_FOR_END]]: +// CK-64-NEXT: br label %[[OMP_LOOP_EXIT:.*]] +// CK-64: [[OMP_LOOP_EXIT]]: +// CK-64-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP1]]) +// CK-64-NEXT: ret void +// +// +// CK-64-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.omp_outlined.omp_outlined( +// CK-64-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[DOTPREVIOUS_LB_:%.*]], i64 [[DOTPREVIOUS_UB_:%.*]], i64 [[X:%.*]]) #[[ATTR1]] { +// CK-64-NEXT: [[ENTRY:.*:]] +// CK-64-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +// CK-64-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CK-64-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CK-64-NEXT: [[X_ADDR:%.*]] = alloca i64, align 8 +// CK-64-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CK-64-NEXT: [[I:%.*]] = alloca i32, align 4 +// CK-64-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CK-64-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// CK-64-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8 +// CK-64-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8 +// CK-64-NEXT: store i64 [[X]], ptr [[X_ADDR]], align 8 +// CK-64-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 +// CK-64-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4 +// CK-64-NEXT: [[TMP0:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8 +// CK-64-NEXT: [[CONV:%.*]] = trunc i64 [[TMP0]] to i32 +// CK-64-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8 +// CK-64-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32 +// CK-64-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4 +// CK-64-NEXT: store i32 [[CONV1]], ptr [[DOTOMP_UB]], align 4 +// CK-64-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CK-64-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CK-64-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CK-64-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 +// CK-64-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB3:[0-9]+]], i32 [[TMP3]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CK-64-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CK-64-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9 +// CK-64-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] +// CK-64: [[COND_TRUE]]: +// CK-64-NEXT: br label %[[COND_END:.*]] +// CK-64: [[COND_FALSE]]: +// CK-64-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 
+// CK-64-NEXT: br label %[[COND_END]] +// CK-64: [[COND_END]]: +// CK-64-NEXT: [[COND:%.*]] = phi i32 [ 9, %[[COND_TRUE]] ], [ [[TMP5]], %[[COND_FALSE]] ] +// CK-64-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 +// CK-64-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CK-64-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4 +// CK-64-NEXT: br label %[[OMP_INNER_FOR_COND:.*]] +// CK-64: [[OMP_INNER_FOR_COND]]: +// CK-64-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CK-64-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CK-64-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]] +// CK-64-NEXT: br i1 [[CMP2]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_END:.*]] +// CK-64: [[OMP_INNER_FOR_BODY]]: +// CK-64-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CK-64-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1 +// CK-64-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CK-64-NEXT: store i32 [[ADD]], ptr [[I]], align 4 +// CK-64-NEXT: [[TMP10:%.*]] = load i32, ptr [[X_ADDR]], align 4 +// CK-64-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1 +// CK-64-NEXT: store i32 [[ADD3]], ptr [[X_ADDR]], align 4 +// CK-64-NEXT: br label %[[OMP_BODY_CONTINUE:.*]] +// CK-64: [[OMP_BODY_CONTINUE]]: +// CK-64-NEXT: br label %[[OMP_INNER_FOR_INC:.*]] +// CK-64: [[OMP_INNER_FOR_INC]]: +// CK-64-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CK-64-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], 1 +// CK-64-NEXT: store i32 [[ADD4]], ptr [[DOTOMP_IV]], align 4 +// CK-64-NEXT: br label %[[OMP_INNER_FOR_COND]] +// CK-64: [[OMP_INNER_FOR_END]]: +// CK-64-NEXT: br label %[[OMP_LOOP_EXIT:.*]] +// CK-64: [[OMP_LOOP_EXIT]]: +// CK-64-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP3]]) +// CK-64-NEXT: ret void +// +// +// CK-32-LABEL: define dso_local void @_Z4foo1i( +// CK-32-SAME: i32 [[A:%.*]]) #[[ATTR0:[0-9]+]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// CK-32-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[D:%.*]] = alloca double, align 8 +// CK-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4 +// CK-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4 +// CK-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4 +// CK-32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK-32-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// CK-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CK-32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// CK-32-NEXT: store double [[CONV]], ptr [[D]], align 8 +// CK-32-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-32-NEXT: store ptr [[D]], ptr [[TMP1]], align 4 +// CK-32-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-32-NEXT: store ptr [[D]], ptr [[TMP2]], align 4 +// CK-32-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CK-32-NEXT: store ptr null, ptr [[TMP3]], align 4 +// CK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK-32-NEXT: store i32 3, ptr [[TMP6]], align 4 +// CK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr 
[[KERNEL_ARGS]], i32 0, i32 1 +// CK-32-NEXT: store i32 1, ptr [[TMP7]], align 4 +// CK-32-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK-32-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 4 +// CK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK-32-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 4 +// CK-32-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK-32-NEXT: store ptr @.offload_sizes, ptr [[TMP10]], align 4 +// CK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK-32-NEXT: store ptr @.offload_maptypes, ptr [[TMP11]], align 4 +// CK-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK-32-NEXT: store ptr null, ptr [[TMP12]], align 4 +// CK-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK-32-NEXT: store ptr null, ptr [[TMP13]], align 4 +// CK-32-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK-32-NEXT: store i64 0, ptr [[TMP14]], align 8 +// CK-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK-32-NEXT: store i64 0, ptr [[TMP15]], align 8 +// CK-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP16]], align 4 +// CK-32-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK-32-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4 +// CK-32-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK-32-NEXT: store i32 0, ptr [[TMP18]], align 4 +// CK-32-NEXT: [[TMP19:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1i_l23.region_id, ptr [[KERNEL_ARGS]]) +// CK-32-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CK-32-NEXT: br i1 [[TMP20]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK-32: [[OMP_OFFLOAD_FAILED]]: +// CK-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1i_l23(ptr [[D]]) #[[ATTR2:[0-9]+]] +// CK-32-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK-32: [[OMP_OFFLOAD_CONT]]: +// CK-32-NEXT: ret void +// +// +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1i_l23( +// CK-32-SAME: ptr nonnull align 4 dereferenceable(8) [[D:%.*]]) #[[ATTR1:[0-9]+]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// CK-32-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[D1:%.*]] = alloca double, align 8 +// CK-32-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 4 +// CK-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !nonnull [[META20:![0-9]+]], !align [[META21:![0-9]+]] +// CK-32-NEXT: [[TMP1:%.*]] = load double, ptr [[D1]], align 8 +// CK-32-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00 +// CK-32-NEXT: store double [[ADD]], ptr [[D1]], align 8 +// CK-32-NEXT: ret void +// +// +// CK-32-LABEL: define dso_local void @_Z4foo2v( +// CK-32-SAME: ) 
#[[ATTR0]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// CK-32-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], align 4 +// CK-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4 +// CK-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4 +// CK-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4 +// CK-32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK-32-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-32-NEXT: store ptr [[PVTARR]], ptr [[TMP0]], align 4 +// CK-32-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-32-NEXT: store ptr [[PVTARR]], ptr [[TMP1]], align 4 +// CK-32-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CK-32-NEXT: store ptr null, ptr [[TMP2]], align 4 +// CK-32-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK-32-NEXT: store i32 3, ptr [[TMP5]], align 4 +// CK-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK-32-NEXT: store i32 1, ptr [[TMP6]], align 4 +// CK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK-32-NEXT: store ptr [[TMP3]], ptr [[TMP7]], align 4 +// CK-32-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK-32-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 4 +// CK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK-32-NEXT: store ptr @.offload_sizes.1, ptr [[TMP9]], align 4 +// CK-32-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK-32-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP10]], align 4 +// CK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK-32-NEXT: store ptr null, ptr [[TMP11]], align 4 +// CK-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK-32-NEXT: store ptr null, ptr [[TMP12]], align 4 +// CK-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK-32-NEXT: store i64 0, ptr [[TMP13]], align 8 +// CK-32-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK-32-NEXT: store i64 0, ptr [[TMP14]], align 8 +// CK-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP15]], align 4 +// CK-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK-32-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4 +// CK-32-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK-32-NEXT: store i32 0, ptr [[TMP17]], align 4 +// 
CK-32-NEXT: [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l32.region_id, ptr [[KERNEL_ARGS]]) +// CK-32-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 +// CK-32-NEXT: br i1 [[TMP19]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK-32: [[OMP_OFFLOAD_FAILED]]: +// CK-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l32(ptr [[PVTARR]]) #[[ATTR2]] +// CK-32-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK-32: [[OMP_OFFLOAD_CONT]]: +// CK-32-NEXT: ret void +// +// +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l32( +// CK-32-SAME: ptr nonnull align 4 dereferenceable(40) [[PVTARR:%.*]]) #[[ATTR1]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// CK-32-NEXT: [[PVTARR_ADDR:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[PVTARR1:%.*]] = alloca [10 x i32], align 4 +// CK-32-NEXT: store ptr [[PVTARR]], ptr [[PVTARR_ADDR]], align 4 +// CK-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PVTARR_ADDR]], align 4, !nonnull [[META20]], !align [[META21]] +// CK-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR1]], i32 0, i32 5 +// CK-32-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CK-32-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CK-32-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// CK-32-NEXT: ret void +// +// +// CK-32-LABEL: define dso_local void @_Z4foo3v( +// CK-32-SAME: ) #[[ATTR0]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// CK-32-NEXT: [[PA:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4 +// CK-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4 +// CK-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4 +// CK-32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PA]], align 4 +// CK-32-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-32-NEXT: store ptr [[TMP0]], ptr [[TMP1]], align 4 +// CK-32-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-32-NEXT: store ptr [[TMP0]], ptr [[TMP2]], align 4 +// CK-32-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CK-32-NEXT: store ptr null, ptr [[TMP3]], align 4 +// CK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK-32-NEXT: store i32 3, ptr [[TMP6]], align 4 +// CK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK-32-NEXT: store i32 1, ptr [[TMP7]], align 4 +// CK-32-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK-32-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 4 +// CK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK-32-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 4 +// CK-32-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK-32-NEXT: store ptr @.offload_sizes.3, ptr [[TMP10]], 
align 4 +// CK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK-32-NEXT: store ptr @.offload_maptypes.4, ptr [[TMP11]], align 4 +// CK-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK-32-NEXT: store ptr null, ptr [[TMP12]], align 4 +// CK-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK-32-NEXT: store ptr null, ptr [[TMP13]], align 4 +// CK-32-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK-32-NEXT: store i64 0, ptr [[TMP14]], align 8 +// CK-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK-32-NEXT: store i64 0, ptr [[TMP15]], align 8 +// CK-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP16]], align 4 +// CK-32-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK-32-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4 +// CK-32-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK-32-NEXT: store i32 0, ptr [[TMP18]], align 4 +// CK-32-NEXT: [[TMP19:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l41.region_id, ptr [[KERNEL_ARGS]]) +// CK-32-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CK-32-NEXT: br i1 [[TMP20]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK-32: [[OMP_OFFLOAD_FAILED]]: +// CK-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l41(ptr [[TMP0]]) #[[ATTR2]] +// CK-32-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK-32: [[OMP_OFFLOAD_CONT]]: +// CK-32-NEXT: ret void +// +// +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l41( +// CK-32-SAME: ptr [[PA:%.*]]) #[[ATTR1]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// CK-32-NEXT: [[PA_ADDR:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[PA1:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: store ptr [[PA]], ptr [[PA_ADDR]], align 4 +// CK-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PA1]], align 4 +// CK-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 50 +// CK-32-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CK-32-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// CK-32-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// CK-32-NEXT: ret void +// +// +// CK-32-LABEL: define dso_local void @_Z4foo4v( +// CK-32-SAME: ) #[[ATTR0]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// CK-32-NEXT: [[P:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[P_CASTED:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4 +// CK-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4 +// CK-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4 +// CK-32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[P]], align 4 +// CK-32-NEXT: store i32 [[TMP0]], ptr [[P_CASTED]], align 4 +// CK-32-NEXT: [[TMP1:%.*]] = load i32, ptr [[P_CASTED]], align 4 +// CK-32-NEXT: [[TMP2:%.*]] = 
getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-32-NEXT: store i32 [[TMP1]], ptr [[TMP2]], align 4 +// CK-32-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-32-NEXT: store i32 [[TMP1]], ptr [[TMP3]], align 4 +// CK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CK-32-NEXT: store ptr null, ptr [[TMP4]], align 4 +// CK-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK-32-NEXT: store i32 3, ptr [[TMP7]], align 4 +// CK-32-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK-32-NEXT: store i32 1, ptr [[TMP8]], align 4 +// CK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK-32-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 4 +// CK-32-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK-32-NEXT: store ptr [[TMP6]], ptr [[TMP10]], align 4 +// CK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK-32-NEXT: store ptr @.offload_sizes.5, ptr [[TMP11]], align 4 +// CK-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK-32-NEXT: store ptr @.offload_maptypes.6, ptr [[TMP12]], align 4 +// CK-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK-32-NEXT: store ptr null, ptr [[TMP13]], align 4 +// CK-32-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK-32-NEXT: store ptr null, ptr [[TMP14]], align 4 +// CK-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK-32-NEXT: store i64 0, ptr [[TMP15]], align 8 +// CK-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK-32-NEXT: store i64 0, ptr [[TMP16]], align 8 +// CK-32-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP17]], align 4 +// CK-32-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK-32-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP18]], align 4 +// CK-32-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK-32-NEXT: store i32 0, ptr [[TMP19]], align 4 +// CK-32-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l52.region_id, ptr [[KERNEL_ARGS]]) +// CK-32-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 +// CK-32-NEXT: br i1 [[TMP21]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK-32: [[OMP_OFFLOAD_FAILED]]: +// CK-32-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l52(i32 [[TMP1]]) #[[ATTR2]] +// CK-32-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK-32: [[OMP_OFFLOAD_CONT]]: +// CK-32-NEXT: ret void +// +// +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l52( +// CK-32-SAME: i32 [[P:%.*]]) #[[ATTR1]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// CK-32-NEXT: [[P_ADDR:%.*]] = alloca i32, align 4 +// CK-32-NEXT: store i32 [[P]], ptr [[P_ADDR]], align 4 +// CK-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[P_ADDR]], align 4 +// CK-32-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 +// CK-32-NEXT: store i32 [[INC]], ptr [[P_ADDR]], align 4 +// CK-32-NEXT: ret void +// +// +// CK-32-LABEL: define dso_local void @_Z4foo5i( +// CK-32-SAME: i32 [[A:%.*]]) #[[ATTR0]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// CK-32-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[D:%.*]] = alloca double, align 8 +// CK-32-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], align 4 +// CK-32-NEXT: [[PA:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 4 +// CK-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 4 +// CK-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 4 +// CK-32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK-32-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// CK-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CK-32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// CK-32-NEXT: store double [[CONV]], ptr [[D]], align 8 +// CK-32-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PA]], align 4 +// CK-32-NEXT: [[TMP2:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-32-NEXT: store ptr [[D]], ptr [[TMP2]], align 4 +// CK-32-NEXT: [[TMP3:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-32-NEXT: store ptr [[D]], ptr [[TMP3]], align 4 +// CK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CK-32-NEXT: store ptr null, ptr [[TMP4]], align 4 +// CK-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CK-32-NEXT: store ptr [[PVTARR]], ptr [[TMP5]], align 4 +// CK-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CK-32-NEXT: store ptr [[PVTARR]], ptr [[TMP6]], align 4 +// CK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1 +// CK-32-NEXT: store ptr null, ptr [[TMP7]], align 4 +// CK-32-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CK-32-NEXT: store ptr [[TMP1]], ptr [[TMP8]], align 4 +// CK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CK-32-NEXT: store ptr [[TMP1]], ptr [[TMP9]], align 4 +// CK-32-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2 +// CK-32-NEXT: store ptr null, ptr [[TMP10]], align 4 +// CK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK-32-NEXT: store i32 3, ptr [[TMP13]], align 4 +// CK-32-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK-32-NEXT: store i32 3, ptr [[TMP14]], align 4 +// CK-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK-32-NEXT: store ptr [[TMP11]], ptr [[TMP15]], align 4 +// CK-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK-32-NEXT: store ptr [[TMP12]], ptr [[TMP16]], align 4 +// CK-32-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK-32-NEXT: store ptr @.offload_sizes.7, ptr [[TMP17]], align 4 +// CK-32-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK-32-NEXT: store ptr @.offload_maptypes.8, ptr [[TMP18]], align 4 +// CK-32-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK-32-NEXT: store ptr null, ptr [[TMP19]], align 4 +// CK-32-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK-32-NEXT: store ptr null, ptr [[TMP20]], align 4 +// CK-32-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK-32-NEXT: store i64 0, ptr [[TMP21]], align 8 +// CK-32-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK-32-NEXT: store i64 0, ptr [[TMP22]], align 8 +// CK-32-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP23]], align 4 +// CK-32-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK-32-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP24]], align 4 +// CK-32-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK-32-NEXT: store i32 0, ptr [[TMP25]], align 4 +// CK-32-NEXT: [[TMP26:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo5i_l66.region_id, ptr [[KERNEL_ARGS]]) +// CK-32-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 +// CK-32-NEXT: br i1 [[TMP27]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK-32: [[OMP_OFFLOAD_FAILED]]: +// CK-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo5i_l66(ptr [[D]], ptr [[PVTARR]], ptr [[TMP1]]) #[[ATTR2]] +// CK-32-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK-32: [[OMP_OFFLOAD_CONT]]: +// CK-32-NEXT: ret void +// +// +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo5i_l66( +// CK-32-SAME: ptr nonnull align 4 dereferenceable(8) [[D:%.*]], ptr nonnull align 4 dereferenceable(40) [[PVTARR:%.*]], ptr [[PA:%.*]]) #[[ATTR1]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// CK-32-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[PVTARR_ADDR:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[PA_ADDR:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[D1:%.*]] = alloca double, align 8 +// CK-32-NEXT: [[PVTARR2:%.*]] = alloca [10 x i32], align 4 +// CK-32-NEXT: [[PA3:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 4 +// CK-32-NEXT: store ptr [[PVTARR]], ptr 
[[PVTARR_ADDR]], align 4 +// CK-32-NEXT: store ptr [[PA]], ptr [[PA_ADDR]], align 4 +// CK-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !nonnull [[META20]], !align [[META21]] +// CK-32-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PVTARR_ADDR]], align 4, !nonnull [[META20]], !align [[META21]] +// CK-32-NEXT: [[TMP2:%.*]] = load double, ptr [[D1]], align 8 +// CK-32-NEXT: [[ADD:%.*]] = fadd double [[TMP2]], 1.000000e+00 +// CK-32-NEXT: store double [[ADD]], ptr [[D1]], align 8 +// CK-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR2]], i32 0, i32 5 +// CK-32-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CK-32-NEXT: [[INC:%.*]] = add nsw i32 [[TMP3]], 1 +// CK-32-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// CK-32-NEXT: [[TMP4:%.*]] = load ptr, ptr [[PA3]], align 4 +// CK-32-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 50 +// CK-32-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4 +// CK-32-NEXT: [[INC5:%.*]] = add nsw i32 [[TMP5]], 1 +// CK-32-NEXT: store i32 [[INC5]], ptr [[ARRAYIDX4]], align 4 +// CK-32-NEXT: ret void +// +// +// CK-32-LABEL: define dso_local void @_Z4foo6i( +// CK-32-SAME: i32 [[A:%.*]]) #[[ATTR0]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// CK-32-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[D:%.*]] = alloca double, align 8 +// CK-32-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], align 4 +// CK-32-NEXT: [[PA:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 4 +// CK-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 4 +// CK-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 4 +// CK-32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK-32-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// CK-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CK-32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// CK-32-NEXT: store double [[CONV]], ptr [[D]], align 8 +// CK-32-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PA]], align 4 +// CK-32-NEXT: [[TMP2:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-32-NEXT: store ptr [[D]], ptr [[TMP2]], align 4 +// CK-32-NEXT: [[TMP3:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-32-NEXT: store ptr [[D]], ptr [[TMP3]], align 4 +// CK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CK-32-NEXT: store ptr null, ptr [[TMP4]], align 4 +// CK-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CK-32-NEXT: store ptr [[PVTARR]], ptr [[TMP5]], align 4 +// CK-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CK-32-NEXT: store ptr [[PVTARR]], ptr [[TMP6]], align 4 +// CK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1 +// CK-32-NEXT: store ptr null, ptr [[TMP7]], align 4 +// CK-32-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CK-32-NEXT: store ptr [[TMP1]], ptr [[TMP8]], align 4 +// CK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CK-32-NEXT: store ptr [[TMP1]], ptr [[TMP9]], align 4 +// CK-32-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2 +// CK-32-NEXT: store ptr null, ptr [[TMP10]], align 4 +// CK-32-NEXT: 
[[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK-32-NEXT: store i32 3, ptr [[TMP13]], align 4 +// CK-32-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK-32-NEXT: store i32 3, ptr [[TMP14]], align 4 +// CK-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK-32-NEXT: store ptr [[TMP11]], ptr [[TMP15]], align 4 +// CK-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK-32-NEXT: store ptr [[TMP12]], ptr [[TMP16]], align 4 +// CK-32-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK-32-NEXT: store ptr @.offload_sizes.9, ptr [[TMP17]], align 4 +// CK-32-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK-32-NEXT: store ptr @.offload_maptypes.10, ptr [[TMP18]], align 4 +// CK-32-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK-32-NEXT: store ptr null, ptr [[TMP19]], align 4 +// CK-32-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK-32-NEXT: store ptr null, ptr [[TMP20]], align 4 +// CK-32-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK-32-NEXT: store i64 0, ptr [[TMP21]], align 8 +// CK-32-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK-32-NEXT: store i64 0, ptr [[TMP22]], align 8 +// CK-32-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP23]], align 4 +// CK-32-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK-32-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP24]], align 4 +// CK-32-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK-32-NEXT: store i32 0, ptr [[TMP25]], align 4 +// CK-32-NEXT: [[TMP26:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo6i_l82.region_id, ptr [[KERNEL_ARGS]]) +// CK-32-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 +// CK-32-NEXT: br i1 [[TMP27]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK-32: [[OMP_OFFLOAD_FAILED]]: +// CK-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo6i_l82(ptr [[D]], ptr [[PVTARR]], ptr [[TMP1]]) #[[ATTR2]] +// CK-32-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK-32: [[OMP_OFFLOAD_CONT]]: +// CK-32-NEXT: ret void +// +// +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo6i_l82( +// CK-32-SAME: ptr nonnull align 4 dereferenceable(8) [[D:%.*]], ptr nonnull align 4 dereferenceable(40) [[PVTARR:%.*]], ptr [[PA:%.*]]) #[[ATTR1]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// 
CK-32-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[PVTARR_ADDR:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[PA_ADDR:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[D1:%.*]] = alloca double, align 8 +// CK-32-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 4 +// CK-32-NEXT: store ptr [[PVTARR]], ptr [[PVTARR_ADDR]], align 4 +// CK-32-NEXT: store ptr [[PA]], ptr [[PA_ADDR]], align 4 +// CK-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !nonnull [[META20]], !align [[META21]] +// CK-32-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PVTARR_ADDR]], align 4, !nonnull [[META20]], !align [[META21]] +// CK-32-NEXT: [[TMP2:%.*]] = load double, ptr [[TMP0]], align 8 +// CK-32-NEXT: store double [[TMP2]], ptr [[D1]], align 8 +// CK-32-NEXT: [[TMP3:%.*]] = load double, ptr [[D1]], align 8 +// CK-32-NEXT: [[ADD:%.*]] = fadd double [[TMP3]], 1.000000e+00 +// CK-32-NEXT: store double [[ADD]], ptr [[D1]], align 8 +// CK-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP1]], i32 0, i32 5 +// CK-32-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CK-32-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 +// CK-32-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// CK-32-NEXT: [[TMP5:%.*]] = load ptr, ptr [[PA_ADDR]], align 4 +// CK-32-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 50 +// CK-32-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +// CK-32-NEXT: [[INC3:%.*]] = add nsw i32 [[TMP6]], 1 +// CK-32-NEXT: store i32 [[INC3]], ptr [[ARRAYIDX2]], align 4 +// CK-32-NEXT: ret void +// +// +// CK-32-LABEL: define dso_local void @_Z4foo7i( +// CK-32-SAME: i32 [[A:%.*]]) #[[ATTR0]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// CK-32-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[D:%.*]] = alloca double, align 8 +// CK-32-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], align 4 +// CK-32-NEXT: [[PA:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 4 +// CK-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 4 +// CK-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 4 +// CK-32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK-32-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// CK-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CK-32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// CK-32-NEXT: store double [[CONV]], ptr [[D]], align 8 +// CK-32-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PA]], align 4 +// CK-32-NEXT: [[TMP2:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-32-NEXT: store ptr [[D]], ptr [[TMP2]], align 4 +// CK-32-NEXT: [[TMP3:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-32-NEXT: store ptr [[D]], ptr [[TMP3]], align 4 +// CK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CK-32-NEXT: store ptr null, ptr [[TMP4]], align 4 +// CK-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CK-32-NEXT: store ptr [[PVTARR]], ptr [[TMP5]], align 4 +// CK-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CK-32-NEXT: store ptr [[PVTARR]], ptr [[TMP6]], align 4 +// CK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1 +// CK-32-NEXT: store ptr null, ptr [[TMP7]], align 4 +// CK-32-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr 
[[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CK-32-NEXT: store ptr [[TMP1]], ptr [[TMP8]], align 4 +// CK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CK-32-NEXT: store ptr [[TMP1]], ptr [[TMP9]], align 4 +// CK-32-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2 +// CK-32-NEXT: store ptr null, ptr [[TMP10]], align 4 +// CK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK-32-NEXT: store i32 3, ptr [[TMP13]], align 4 +// CK-32-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK-32-NEXT: store i32 3, ptr [[TMP14]], align 4 +// CK-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK-32-NEXT: store ptr [[TMP11]], ptr [[TMP15]], align 4 +// CK-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK-32-NEXT: store ptr [[TMP12]], ptr [[TMP16]], align 4 +// CK-32-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK-32-NEXT: store ptr @.offload_sizes.11, ptr [[TMP17]], align 4 +// CK-32-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK-32-NEXT: store ptr @.offload_maptypes.12, ptr [[TMP18]], align 4 +// CK-32-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK-32-NEXT: store ptr null, ptr [[TMP19]], align 4 +// CK-32-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK-32-NEXT: store ptr null, ptr [[TMP20]], align 4 +// CK-32-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK-32-NEXT: store i64 0, ptr [[TMP21]], align 8 +// CK-32-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK-32-NEXT: store i64 0, ptr [[TMP22]], align 8 +// CK-32-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK-32-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP23]], align 4 +// CK-32-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK-32-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP24]], align 4 +// CK-32-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK-32-NEXT: store i32 0, ptr [[TMP25]], align 4 +// CK-32-NEXT: [[TMP26:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo7i_l98.region_id, ptr [[KERNEL_ARGS]]) +// CK-32-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 +// CK-32-NEXT: br i1 [[TMP27]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK-32: [[OMP_OFFLOAD_FAILED]]: +// CK-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo7i_l98(ptr [[D]], 
ptr [[PVTARR]], ptr [[TMP1]]) #[[ATTR2]] +// CK-32-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK-32: [[OMP_OFFLOAD_CONT]]: +// CK-32-NEXT: ret void +// +// +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo7i_l98( +// CK-32-SAME: ptr nonnull align 4 dereferenceable(8) [[D:%.*]], ptr nonnull align 4 dereferenceable(40) [[PVTARR:%.*]], ptr [[PA:%.*]]) #[[ATTR1]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// CK-32-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[PVTARR_ADDR:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[PA_ADDR:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[D1:%.*]] = alloca double, align 8 +// CK-32-NEXT: [[PVTARR2:%.*]] = alloca [10 x i32], align 4 +// CK-32-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 4 +// CK-32-NEXT: store ptr [[PVTARR]], ptr [[PVTARR_ADDR]], align 4 +// CK-32-NEXT: store ptr [[PA]], ptr [[PA_ADDR]], align 4 +// CK-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !nonnull [[META20]], !align [[META21]] +// CK-32-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PVTARR_ADDR]], align 4, !nonnull [[META20]], !align [[META21]] +// CK-32-NEXT: [[TMP2:%.*]] = load double, ptr [[TMP0]], align 8 +// CK-32-NEXT: store double [[TMP2]], ptr [[D1]], align 8 +// CK-32-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[PVTARR2]], ptr align 4 [[TMP1]], i32 40, i1 false) +// CK-32-NEXT: [[TMP3:%.*]] = load double, ptr [[D1]], align 8 +// CK-32-NEXT: [[ADD:%.*]] = fadd double [[TMP3]], 1.000000e+00 +// CK-32-NEXT: store double [[ADD]], ptr [[D1]], align 8 +// CK-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR2]], i32 0, i32 5 +// CK-32-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CK-32-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 +// CK-32-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// CK-32-NEXT: [[TMP5:%.*]] = load ptr, ptr [[PA_ADDR]], align 4 +// CK-32-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 50 +// CK-32-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4 +// CK-32-NEXT: [[INC4:%.*]] = add nsw i32 [[TMP6]], 1 +// CK-32-NEXT: store i32 [[INC4]], ptr [[ARRAYIDX3]], align 4 +// CK-32-NEXT: ret void +// +// +// CK-32-LABEL: define dso_local void @_Z4foo8v( +// CK-32-SAME: ) #[[ATTR0]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// CK-32-NEXT: [[X:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[X_CASTED:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4 +// CK-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4 +// CK-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4 +// CK-32-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CK-32-NEXT: store i32 0, ptr [[X]], align 4 +// CK-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4 +// CK-32-NEXT: store i32 [[TMP0]], ptr [[X_CASTED]], align 4 +// CK-32-NEXT: [[TMP1:%.*]] = load i32, ptr [[X_CASTED]], align 4 +// CK-32-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-32-NEXT: store i32 [[TMP1]], ptr [[TMP2]], align 4 +// CK-32-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-32-NEXT: store i32 [[TMP1]], ptr [[TMP3]], align 4 +// CK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CK-32-NEXT: store ptr null, ptr [[TMP4]], align 4 +// CK-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr 
[[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CK-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CK-32-NEXT: store i32 3, ptr [[TMP7]], align 4 +// CK-32-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CK-32-NEXT: store i32 1, ptr [[TMP8]], align 4 +// CK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CK-32-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 4 +// CK-32-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CK-32-NEXT: store ptr [[TMP6]], ptr [[TMP10]], align 4 +// CK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CK-32-NEXT: store ptr @.offload_sizes.13, ptr [[TMP11]], align 4 +// CK-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CK-32-NEXT: store ptr @.offload_maptypes.14, ptr [[TMP12]], align 4 +// CK-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CK-32-NEXT: store ptr null, ptr [[TMP13]], align 4 +// CK-32-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CK-32-NEXT: store ptr null, ptr [[TMP14]], align 4 +// CK-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CK-32-NEXT: store i64 10, ptr [[TMP15]], align 8 +// CK-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CK-32-NEXT: store i64 0, ptr [[TMP16]], align 8 +// CK-32-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CK-32-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4 +// CK-32-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CK-32-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP18]], align 4 +// CK-32-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CK-32-NEXT: store i32 0, ptr [[TMP19]], align 4 +// CK-32-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.region_id, ptr [[KERNEL_ARGS]]) +// CK-32-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 +// CK-32-NEXT: br i1 [[TMP21]], label %[[OMP_OFFLOAD_FAILED:.*]], label %[[OMP_OFFLOAD_CONT:.*]] +// CK-32: [[OMP_OFFLOAD_FAILED]]: +// CK-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112(i32 [[TMP1]]) #[[ATTR2]] +// CK-32-NEXT: br label %[[OMP_OFFLOAD_CONT]] +// CK-32: [[OMP_OFFLOAD_CONT]]: +// CK-32-NEXT: ret void +// +// +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112( +// CK-32-SAME: i32 [[X:%.*]]) #[[ATTR1]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// CK-32-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[X_CASTED:%.*]] = alloca i32, align 4 +// CK-32-NEXT: store i32 [[X]], ptr [[X_ADDR]], align 4 +// CK-32-NEXT: [[TMP0:%.*]] = load i32, 
ptr [[X_ADDR]], align 4 +// CK-32-NEXT: store i32 [[TMP0]], ptr [[X_CASTED]], align 4 +// CK-32-NEXT: [[TMP1:%.*]] = load i32, ptr [[X_CASTED]], align 4 +// CK-32-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.omp_outlined, i32 [[TMP1]]) +// CK-32-NEXT: ret void +// +// +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.omp_outlined( +// CK-32-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i32 [[X:%.*]]) #[[ATTR1]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// CK-32-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[I:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[X_CASTED:%.*]] = alloca i32, align 4 +// CK-32-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CK-32-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4 +// CK-32-NEXT: store i32 [[X]], ptr [[X_ADDR]], align 4 +// CK-32-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 +// CK-32-NEXT: store i32 9, ptr [[DOTOMP_COMB_UB]], align 4 +// CK-32-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CK-32-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CK-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CK-32-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4 +// CK-32-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CK-32-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 +// CK-32-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9 +// CK-32-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] +// CK-32: [[COND_TRUE]]: +// CK-32-NEXT: br label %[[COND_END:.*]] +// CK-32: [[COND_FALSE]]: +// CK-32-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 +// CK-32-NEXT: br label %[[COND_END]] +// CK-32: [[COND_END]]: +// CK-32-NEXT: [[COND:%.*]] = phi i32 [ 9, %[[COND_TRUE]] ], [ [[TMP3]], %[[COND_FALSE]] ] +// CK-32-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4 +// CK-32-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4 +// CK-32-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4 +// CK-32-NEXT: br label %[[OMP_INNER_FOR_COND:.*]] +// CK-32: [[OMP_INNER_FOR_COND]]: +// CK-32-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CK-32-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 +// CK-32-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]] +// CK-32-NEXT: br i1 [[CMP1]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_END:.*]] +// CK-32: [[OMP_INNER_FOR_BODY]]: +// CK-32-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4 +// CK-32-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 +// CK-32-NEXT: [[TMP9:%.*]] = load i32, ptr [[X_ADDR]], align 4 +// CK-32-NEXT: store i32 [[TMP9]], ptr [[X_CASTED]], align 4 +// CK-32-NEXT: [[TMP10:%.*]] = load i32, ptr 
[[X_CASTED]], align 4 +// CK-32-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.omp_outlined.omp_outlined, i32 [[TMP7]], i32 [[TMP8]], i32 [[TMP10]]) +// CK-32-NEXT: br label %[[OMP_INNER_FOR_INC:.*]] +// CK-32: [[OMP_INNER_FOR_INC]]: +// CK-32-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CK-32-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4 +// CK-32-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP12]] +// CK-32-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4 +// CK-32-NEXT: br label %[[OMP_INNER_FOR_COND]] +// CK-32: [[OMP_INNER_FOR_END]]: +// CK-32-NEXT: br label %[[OMP_LOOP_EXIT:.*]] +// CK-32: [[OMP_LOOP_EXIT]]: +// CK-32-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP1]]) +// CK-32-NEXT: ret void +// +// +// CK-32-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo8v_l112.omp_outlined.omp_outlined( +// CK-32-SAME: ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i32 [[DOTPREVIOUS_LB_:%.*]], i32 [[DOTPREVIOUS_UB_:%.*]], i32 [[X:%.*]]) #[[ATTR1]] { +// CK-32-NEXT: [[ENTRY:.*:]] +// CK-32-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 +// CK-32-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CK-32-NEXT: [[I:%.*]] = alloca i32, align 4 +// CK-32-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CK-32-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4 +// CK-32-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4 +// CK-32-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4 +// CK-32-NEXT: store i32 [[X]], ptr [[X_ADDR]], align 4 +// CK-32-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 +// CK-32-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4 +// CK-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4 +// CK-32-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4 +// CK-32-NEXT: store i32 [[TMP0]], ptr [[DOTOMP_LB]], align 4 +// CK-32-NEXT: store i32 [[TMP1]], ptr [[DOTOMP_UB]], align 4 +// CK-32-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CK-32-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CK-32-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CK-32-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 +// CK-32-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB3:[0-9]+]], i32 [[TMP3]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CK-32-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CK-32-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9 +// CK-32-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] +// CK-32: [[COND_TRUE]]: +// CK-32-NEXT: br label %[[COND_END:.*]] +// CK-32: [[COND_FALSE]]: +// CK-32-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CK-32-NEXT: br label %[[COND_END]] +// 
CK-32: [[COND_END]]: +// CK-32-NEXT: [[COND:%.*]] = phi i32 [ 9, %[[COND_TRUE]] ], [ [[TMP5]], %[[COND_FALSE]] ] +// CK-32-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 +// CK-32-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CK-32-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4 +// CK-32-NEXT: br label %[[OMP_INNER_FOR_COND:.*]] +// CK-32: [[OMP_INNER_FOR_COND]]: +// CK-32-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CK-32-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CK-32-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]] +// CK-32-NEXT: br i1 [[CMP1]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_END:.*]] +// CK-32: [[OMP_INNER_FOR_BODY]]: +// CK-32-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CK-32-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1 +// CK-32-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CK-32-NEXT: store i32 [[ADD]], ptr [[I]], align 4 +// CK-32-NEXT: [[TMP10:%.*]] = load i32, ptr [[X_ADDR]], align 4 +// CK-32-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP10]], 1 +// CK-32-NEXT: store i32 [[ADD2]], ptr [[X_ADDR]], align 4 +// CK-32-NEXT: br label %[[OMP_BODY_CONTINUE:.*]] +// CK-32: [[OMP_BODY_CONTINUE]]: +// CK-32-NEXT: br label %[[OMP_INNER_FOR_INC:.*]] +// CK-32: [[OMP_INNER_FOR_INC]]: +// CK-32-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CK-32-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP11]], 1 +// CK-32-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4 +// CK-32-NEXT: br label %[[OMP_INNER_FOR_COND]] +// CK-32: [[OMP_INNER_FOR_END]]: +// CK-32-NEXT: br label %[[OMP_LOOP_EXIT:.*]] +// CK-32: [[OMP_LOOP_EXIT]]: +// CK-32-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP3]]) +// CK-32-NEXT: ret void +// +// +// SIMD-ONLY-64-LABEL: define dso_local void @_Z4foo1i( +// SIMD-ONLY-64-SAME: i32 signext [[A:%.*]]) #[[ATTR0:[0-9]+]] { +// SIMD-ONLY-64-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY-64-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY-64-NEXT: [[D:%.*]] = alloca double, align 8 +// SIMD-ONLY-64-NEXT: [[D1:%.*]] = alloca double, align 8 +// SIMD-ONLY-64-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// SIMD-ONLY-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// SIMD-ONLY-64-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// SIMD-ONLY-64-NEXT: store double [[CONV]], ptr [[D]], align 8 +// SIMD-ONLY-64-NEXT: [[TMP1:%.*]] = load double, ptr [[D1]], align 8 +// SIMD-ONLY-64-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00 +// SIMD-ONLY-64-NEXT: store double [[ADD]], ptr [[D1]], align 8 +// SIMD-ONLY-64-NEXT: ret void +// +// +// SIMD-ONLY-64-LABEL: define dso_local void @_Z4foo2v( +// SIMD-ONLY-64-SAME: ) #[[ATTR0]] { +// SIMD-ONLY-64-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY-64-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY-64-NEXT: [[PVTARR1:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR1]], i64 0, i64 5 +// SIMD-ONLY-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-64-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY-64-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-64-NEXT: ret void +// +// +// SIMD-ONLY-64-LABEL: define dso_local void @_Z4foo3v( +// SIMD-ONLY-64-SAME: ) #[[ATTR0]] { +// SIMD-ONLY-64-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY-64-NEXT: [[PA:%.*]] = alloca ptr, align 8 +// SIMD-ONLY-64-NEXT: [[PA1:%.*]] = alloca ptr, align 8 +// SIMD-ONLY-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PA1]], align 8 
+// SIMD-ONLY-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 50 +// SIMD-ONLY-64-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-64-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// SIMD-ONLY-64-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-64-NEXT: ret void +// +// +// SIMD-ONLY-64-LABEL: define dso_local void @_Z4foo4v( +// SIMD-ONLY-64-SAME: ) #[[ATTR0]] { +// SIMD-ONLY-64-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY-64-NEXT: [[P:%.*]] = alloca i32, align 4 +// SIMD-ONLY-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[P]], align 4 +// SIMD-ONLY-64-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY-64-NEXT: store i32 [[INC]], ptr [[P]], align 4 +// SIMD-ONLY-64-NEXT: ret void +// +// +// SIMD-ONLY-64-LABEL: define dso_local void @_Z4foo5i( +// SIMD-ONLY-64-SAME: i32 signext [[A:%.*]]) #[[ATTR0]] { +// SIMD-ONLY-64-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY-64-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY-64-NEXT: [[D:%.*]] = alloca double, align 8 +// SIMD-ONLY-64-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY-64-NEXT: [[PA:%.*]] = alloca ptr, align 8 +// SIMD-ONLY-64-NEXT: [[D1:%.*]] = alloca double, align 8 +// SIMD-ONLY-64-NEXT: [[PVTARR2:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY-64-NEXT: [[PA3:%.*]] = alloca ptr, align 8 +// SIMD-ONLY-64-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// SIMD-ONLY-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// SIMD-ONLY-64-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// SIMD-ONLY-64-NEXT: store double [[CONV]], ptr [[D]], align 8 +// SIMD-ONLY-64-NEXT: [[TMP1:%.*]] = load double, ptr [[D1]], align 8 +// SIMD-ONLY-64-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00 +// SIMD-ONLY-64-NEXT: store double [[ADD]], ptr [[D1]], align 8 +// SIMD-ONLY-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR2]], i64 0, i64 5 +// SIMD-ONLY-64-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-64-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// SIMD-ONLY-64-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-64-NEXT: [[TMP3:%.*]] = load ptr, ptr [[PA3]], align 8 +// SIMD-ONLY-64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 50 +// SIMD-ONLY-64-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4 +// SIMD-ONLY-64-NEXT: [[INC5:%.*]] = add nsw i32 [[TMP4]], 1 +// SIMD-ONLY-64-NEXT: store i32 [[INC5]], ptr [[ARRAYIDX4]], align 4 +// SIMD-ONLY-64-NEXT: ret void +// +// +// SIMD-ONLY-64-LABEL: define dso_local void @_Z4foo6i( +// SIMD-ONLY-64-SAME: i32 signext [[A:%.*]]) #[[ATTR0]] { +// SIMD-ONLY-64-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY-64-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY-64-NEXT: [[D:%.*]] = alloca double, align 8 +// SIMD-ONLY-64-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY-64-NEXT: [[PA:%.*]] = alloca ptr, align 8 +// SIMD-ONLY-64-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// SIMD-ONLY-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// SIMD-ONLY-64-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// SIMD-ONLY-64-NEXT: store double [[CONV]], ptr [[D]], align 8 +// SIMD-ONLY-64-NEXT: [[TMP1:%.*]] = load double, ptr [[D]], align 8 +// SIMD-ONLY-64-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00 +// SIMD-ONLY-64-NEXT: store double [[ADD]], ptr [[D]], align 8 +// SIMD-ONLY-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR]], i64 0, i64 5 +// SIMD-ONLY-64-NEXT: [[TMP2:%.*]] = load i32, ptr 
[[ARRAYIDX]], align 4 +// SIMD-ONLY-64-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// SIMD-ONLY-64-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-64-NEXT: [[TMP3:%.*]] = load ptr, ptr [[PA]], align 8 +// SIMD-ONLY-64-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 50 +// SIMD-ONLY-64-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +// SIMD-ONLY-64-NEXT: [[INC2:%.*]] = add nsw i32 [[TMP4]], 1 +// SIMD-ONLY-64-NEXT: store i32 [[INC2]], ptr [[ARRAYIDX1]], align 4 +// SIMD-ONLY-64-NEXT: ret void +// +// +// SIMD-ONLY-64-LABEL: define dso_local void @_Z4foo7i( +// SIMD-ONLY-64-SAME: i32 signext [[A:%.*]]) #[[ATTR0]] { +// SIMD-ONLY-64-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY-64-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY-64-NEXT: [[D:%.*]] = alloca double, align 8 +// SIMD-ONLY-64-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY-64-NEXT: [[PA:%.*]] = alloca ptr, align 8 +// SIMD-ONLY-64-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// SIMD-ONLY-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// SIMD-ONLY-64-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// SIMD-ONLY-64-NEXT: store double [[CONV]], ptr [[D]], align 8 +// SIMD-ONLY-64-NEXT: [[TMP1:%.*]] = load double, ptr [[D]], align 8 +// SIMD-ONLY-64-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00 +// SIMD-ONLY-64-NEXT: store double [[ADD]], ptr [[D]], align 8 +// SIMD-ONLY-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR]], i64 0, i64 5 +// SIMD-ONLY-64-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-64-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// SIMD-ONLY-64-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-64-NEXT: [[TMP3:%.*]] = load ptr, ptr [[PA]], align 8 +// SIMD-ONLY-64-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 50 +// SIMD-ONLY-64-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +// SIMD-ONLY-64-NEXT: [[INC2:%.*]] = add nsw i32 [[TMP4]], 1 +// SIMD-ONLY-64-NEXT: store i32 [[INC2]], ptr [[ARRAYIDX1]], align 4 +// SIMD-ONLY-64-NEXT: ret void +// +// +// SIMD-ONLY-64-LABEL: define dso_local void @_Z4foo8v( +// SIMD-ONLY-64-SAME: ) #[[ATTR0]] { +// SIMD-ONLY-64-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY-64-NEXT: [[X:%.*]] = alloca i32, align 4 +// SIMD-ONLY-64-NEXT: [[I:%.*]] = alloca i32, align 4 +// SIMD-ONLY-64-NEXT: store i32 0, ptr [[X]], align 4 +// SIMD-ONLY-64-NEXT: store i32 0, ptr [[I]], align 4 +// SIMD-ONLY-64-NEXT: br label %[[FOR_COND:.*]] +// SIMD-ONLY-64: [[FOR_COND]]: +// SIMD-ONLY-64-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4 +// SIMD-ONLY-64-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 10 +// SIMD-ONLY-64-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]] +// SIMD-ONLY-64: [[FOR_BODY]]: +// SIMD-ONLY-64-NEXT: [[TMP1:%.*]] = load i32, ptr [[X]], align 4 +// SIMD-ONLY-64-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], 1 +// SIMD-ONLY-64-NEXT: store i32 [[ADD]], ptr [[X]], align 4 +// SIMD-ONLY-64-NEXT: br label %[[FOR_INC:.*]] +// SIMD-ONLY-64: [[FOR_INC]]: +// SIMD-ONLY-64-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4 +// SIMD-ONLY-64-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// SIMD-ONLY-64-NEXT: store i32 [[INC]], ptr [[I]], align 4 +// SIMD-ONLY-64-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP2:![0-9]+]] +// SIMD-ONLY-64: [[FOR_END]]: +// SIMD-ONLY-64-NEXT: ret void +// +// +// SIMD-ONLY-32-LABEL: define dso_local void @_Z4foo1i( +// SIMD-ONLY-32-SAME: i32 [[A:%.*]]) #[[ATTR0:[0-9]+]] { +// 
SIMD-ONLY-32-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY-32-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY-32-NEXT: [[D:%.*]] = alloca double, align 8 +// SIMD-ONLY-32-NEXT: [[D1:%.*]] = alloca double, align 8 +// SIMD-ONLY-32-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// SIMD-ONLY-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// SIMD-ONLY-32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// SIMD-ONLY-32-NEXT: store double [[CONV]], ptr [[D]], align 8 +// SIMD-ONLY-32-NEXT: [[TMP1:%.*]] = load double, ptr [[D1]], align 8 +// SIMD-ONLY-32-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00 +// SIMD-ONLY-32-NEXT: store double [[ADD]], ptr [[D1]], align 8 +// SIMD-ONLY-32-NEXT: ret void +// +// +// SIMD-ONLY-32-LABEL: define dso_local void @_Z4foo2v( +// SIMD-ONLY-32-SAME: ) #[[ATTR0]] { +// SIMD-ONLY-32-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY-32-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY-32-NEXT: [[PVTARR1:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR1]], i32 0, i32 5 +// SIMD-ONLY-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-32-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY-32-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-32-NEXT: ret void +// +// +// SIMD-ONLY-32-LABEL: define dso_local void @_Z4foo3v( +// SIMD-ONLY-32-SAME: ) #[[ATTR0]] { +// SIMD-ONLY-32-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY-32-NEXT: [[PA:%.*]] = alloca ptr, align 4 +// SIMD-ONLY-32-NEXT: [[PA1:%.*]] = alloca ptr, align 4 +// SIMD-ONLY-32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PA1]], align 4 +// SIMD-ONLY-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 50 +// SIMD-ONLY-32-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-32-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +// SIMD-ONLY-32-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-32-NEXT: ret void +// +// +// SIMD-ONLY-32-LABEL: define dso_local void @_Z4foo4v( +// SIMD-ONLY-32-SAME: ) #[[ATTR0]] { +// SIMD-ONLY-32-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY-32-NEXT: [[P:%.*]] = alloca i32, align 4 +// SIMD-ONLY-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[P]], align 4 +// SIMD-ONLY-32-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 +// SIMD-ONLY-32-NEXT: store i32 [[INC]], ptr [[P]], align 4 +// SIMD-ONLY-32-NEXT: ret void +// +// +// SIMD-ONLY-32-LABEL: define dso_local void @_Z4foo5i( +// SIMD-ONLY-32-SAME: i32 [[A:%.*]]) #[[ATTR0]] { +// SIMD-ONLY-32-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY-32-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY-32-NEXT: [[D:%.*]] = alloca double, align 8 +// SIMD-ONLY-32-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY-32-NEXT: [[PA:%.*]] = alloca ptr, align 4 +// SIMD-ONLY-32-NEXT: [[D1:%.*]] = alloca double, align 8 +// SIMD-ONLY-32-NEXT: [[PVTARR2:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY-32-NEXT: [[PA3:%.*]] = alloca ptr, align 4 +// SIMD-ONLY-32-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// SIMD-ONLY-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// SIMD-ONLY-32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// SIMD-ONLY-32-NEXT: store double [[CONV]], ptr [[D]], align 8 +// SIMD-ONLY-32-NEXT: [[TMP1:%.*]] = load double, ptr [[D1]], align 8 +// SIMD-ONLY-32-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00 +// SIMD-ONLY-32-NEXT: store double [[ADD]], ptr [[D1]], align 8 +// SIMD-ONLY-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR2]], i32 
0, i32 5 +// SIMD-ONLY-32-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-32-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// SIMD-ONLY-32-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-32-NEXT: [[TMP3:%.*]] = load ptr, ptr [[PA3]], align 4 +// SIMD-ONLY-32-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 50 +// SIMD-ONLY-32-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4 +// SIMD-ONLY-32-NEXT: [[INC5:%.*]] = add nsw i32 [[TMP4]], 1 +// SIMD-ONLY-32-NEXT: store i32 [[INC5]], ptr [[ARRAYIDX4]], align 4 +// SIMD-ONLY-32-NEXT: ret void +// +// +// SIMD-ONLY-32-LABEL: define dso_local void @_Z4foo6i( +// SIMD-ONLY-32-SAME: i32 [[A:%.*]]) #[[ATTR0]] { +// SIMD-ONLY-32-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY-32-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY-32-NEXT: [[D:%.*]] = alloca double, align 8 +// SIMD-ONLY-32-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY-32-NEXT: [[PA:%.*]] = alloca ptr, align 4 +// SIMD-ONLY-32-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// SIMD-ONLY-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// SIMD-ONLY-32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// SIMD-ONLY-32-NEXT: store double [[CONV]], ptr [[D]], align 8 +// SIMD-ONLY-32-NEXT: [[TMP1:%.*]] = load double, ptr [[D]], align 8 +// SIMD-ONLY-32-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00 +// SIMD-ONLY-32-NEXT: store double [[ADD]], ptr [[D]], align 8 +// SIMD-ONLY-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR]], i32 0, i32 5 +// SIMD-ONLY-32-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-32-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// SIMD-ONLY-32-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-32-NEXT: [[TMP3:%.*]] = load ptr, ptr [[PA]], align 4 +// SIMD-ONLY-32-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 50 +// SIMD-ONLY-32-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +// SIMD-ONLY-32-NEXT: [[INC2:%.*]] = add nsw i32 [[TMP4]], 1 +// SIMD-ONLY-32-NEXT: store i32 [[INC2]], ptr [[ARRAYIDX1]], align 4 +// SIMD-ONLY-32-NEXT: ret void +// +// +// SIMD-ONLY-32-LABEL: define dso_local void @_Z4foo7i( +// SIMD-ONLY-32-SAME: i32 [[A:%.*]]) #[[ATTR0]] { +// SIMD-ONLY-32-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY-32-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// SIMD-ONLY-32-NEXT: [[D:%.*]] = alloca double, align 8 +// SIMD-ONLY-32-NEXT: [[PVTARR:%.*]] = alloca [10 x i32], align 4 +// SIMD-ONLY-32-NEXT: [[PA:%.*]] = alloca ptr, align 4 +// SIMD-ONLY-32-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// SIMD-ONLY-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// SIMD-ONLY-32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +// SIMD-ONLY-32-NEXT: store double [[CONV]], ptr [[D]], align 8 +// SIMD-ONLY-32-NEXT: [[TMP1:%.*]] = load double, ptr [[D]], align 8 +// SIMD-ONLY-32-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 1.000000e+00 +// SIMD-ONLY-32-NEXT: store double [[ADD]], ptr [[D]], align 8 +// SIMD-ONLY-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[PVTARR]], i32 0, i32 5 +// SIMD-ONLY-32-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-32-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// SIMD-ONLY-32-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 +// SIMD-ONLY-32-NEXT: [[TMP3:%.*]] = load ptr, ptr [[PA]], align 4 +// SIMD-ONLY-32-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 50 +// SIMD-ONLY-32-NEXT: 
[[TMP4:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +// SIMD-ONLY-32-NEXT: [[INC2:%.*]] = add nsw i32 [[TMP4]], 1 +// SIMD-ONLY-32-NEXT: store i32 [[INC2]], ptr [[ARRAYIDX1]], align 4 +// SIMD-ONLY-32-NEXT: ret void +// +// +// SIMD-ONLY-32-LABEL: define dso_local void @_Z4foo8v( +// SIMD-ONLY-32-SAME: ) #[[ATTR0]] { +// SIMD-ONLY-32-NEXT: [[ENTRY:.*:]] +// SIMD-ONLY-32-NEXT: [[X:%.*]] = alloca i32, align 4 +// SIMD-ONLY-32-NEXT: [[I:%.*]] = alloca i32, align 4 +// SIMD-ONLY-32-NEXT: store i32 0, ptr [[X]], align 4 +// SIMD-ONLY-32-NEXT: store i32 0, ptr [[I]], align 4 +// SIMD-ONLY-32-NEXT: br label %[[FOR_COND:.*]] +// SIMD-ONLY-32: [[FOR_COND]]: +// SIMD-ONLY-32-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4 +// SIMD-ONLY-32-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 10 +// SIMD-ONLY-32-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]] +// SIMD-ONLY-32: [[FOR_BODY]]: +// SIMD-ONLY-32-NEXT: [[TMP1:%.*]] = load i32, ptr [[X]], align 4 +// SIMD-ONLY-32-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], 1 +// SIMD-ONLY-32-NEXT: store i32 [[ADD]], ptr [[X]], align 4 +// SIMD-ONLY-32-NEXT: br label %[[FOR_INC:.*]] +// SIMD-ONLY-32: [[FOR_INC]]: +// SIMD-ONLY-32-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4 +// SIMD-ONLY-32-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 +// SIMD-ONLY-32-NEXT: store i32 [[INC]], ptr [[I]], align 4 +// SIMD-ONLY-32-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] +// SIMD-ONLY-32: [[FOR_END]]: +// SIMD-ONLY-32-NEXT: ret void +// +//. +// CK-64: [[META19]] = !{} +// CK-64: [[META20]] = !{i64 4} +//. +// CK-32: [[META20]] = !{} +// CK-32: [[META21]] = !{i64 4} +//. +// SIMD-ONLY-64: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]]} +// SIMD-ONLY-64: [[META3]] = !{!"llvm.loop.mustprogress"} +//. +// SIMD-ONLY-32: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]} +// SIMD-ONLY-32: [[META4]] = !{!"llvm.loop.mustprogress"} +//. diff --git a/clang/test/OpenMP/target_default_messages.cpp b/clang/test/OpenMP/target_default_messages.cpp index be677dffa21ca..6a1a1f99360b5 100644 --- a/clang/test/OpenMP/target_default_messages.cpp +++ b/clang/test/OpenMP/target_default_messages.cpp @@ -24,6 +24,8 @@ int main(int argc, char **argv) { for (int i=0; i<200; i++) foo(); #pragma omp target default(x) // expected-error {{expected 'none', 'shared', 'private' or 'firstprivate' in OpenMP clause 'default'}} for (int i=0; i<200; i++) foo(); +#pragma omp target default(none) // expected-note {{explicit data sharing attribute, data mapping attribute, or is_device_ptr clause requested here}} + x++; // expected-error {{variable 'x' must have explicitly specified data sharing attributes, data mapping attributes, or in an is_device_ptr clause}} #endif #ifdef OMP52 diff --git a/clang/test/Sema/warn-unreachable-file-scope.c b/clang/test/Sema/warn-unreachable-file-scope.c new file mode 100644 index 0000000000000..64a6918cbcf77 --- /dev/null +++ b/clang/test/Sema/warn-unreachable-file-scope.c @@ -0,0 +1,37 @@ +// RUN: %clang_cc1 -fsyntax-only -verify %s + +typedef unsigned char u8; + +u8 a1 = (0 ? 0xffff : 0xff); +u8 a2 = (1 ? 0xffff : 0xff); // expected-warning {{implicit conversion from 'int' to 'u8' (aka 'unsigned char') changes value from 65535 to 255}} +u8 a3 = (1 ? 0xff : 0xffff); +u8 a4 = (0 ? 0xff : 0xffff); // expected-warning {{implicit conversion from 'int' to 'u8' (aka 'unsigned char') changes value from 65535 to 255}} + +unsigned long long b1 = 1 ? 0 : 1ULL << 64; +unsigned long long b2 = 0 ? 
0 : 1ULL << 64; // expected-warning {{shift count >= width of type}} +unsigned long long b3 = 1 ? 1ULL << 64 : 0; // expected-warning {{shift count >= width of type}} + +#define M(n) (((n) == 64) ? ~0ULL : ((1ULL << (n)) - 1)) +unsigned long long c1 = M(64); +unsigned long long c2 = M(32); + +static u8 d1 = (0 ? 0xffff : 0xff); +static u8 d2 = (1 ? 0xffff : 0xff); // expected-warning {{implicit conversion from 'int' to 'u8' (aka 'unsigned char') changes value from 65535 to 255}} + +int a = 1 ? 6 : (1,2); +int b = 0 ? 6 : (1,2); // expected-warning {{left operand of comma operator has no effect}} + +void f(void) { + u8 e1 = (0 ? 0xffff : 0xff); + u8 e2 = (1 ? 0xffff : 0xff); // expected-warning {{implicit conversion from 'int' to 'u8' (aka 'unsigned char') changes value from 65535 to 255}} + + unsigned long long e3 = 1 ? 0 : 1ULL << 64; + unsigned long long e4 = 0 ? 0 : 1ULL << 64; // expected-warning {{shift count >= width of type}} +} + +void statics(void) { + static u8 f1 = (0 ? 0xffff : 0xff); + static u8 f2 = (1 ? 0xffff : 0xff); // expected-warning {{implicit conversion from 'int' to 'u8' (aka 'unsigned char') changes value from 65535 to 255}} + static u8 f3 = (1 ? 0xff : 0xffff); + static u8 f4 = (0 ? 0xff : 0xffff); // expected-warning {{implicit conversion from 'int' to 'u8' (aka 'unsigned char') changes value from 65535 to 255}} +} diff --git a/compiler-rt/lib/asan/tests/asan_test.cpp b/compiler-rt/lib/asan/tests/asan_test.cpp index 2d23a12cc6ae2..06c9fdc9b23db 100644 --- a/compiler-rt/lib/asan/tests/asan_test.cpp +++ b/compiler-rt/lib/asan/tests/asan_test.cpp @@ -1115,15 +1115,37 @@ TEST(AddressSanitizer, StressStackReuseTest) { LotsOfStackReuse(); } +// On some platforms (e.g., AIX), the default thread stack size (~96 KB) is +// insufficient for this test and can lead to stack overflows. +#define MIN_STACK_SIZE (128 * 1024) // 128 KB TEST(AddressSanitizer, ThreadedStressStackReuseTest) { const int kNumThreads = 20; pthread_t t[kNumThreads]; +// pthread_attr isn't supported on Windows. +#ifndef _WIN32 + size_t curStackSize = 0; + pthread_attr_t attr; + pthread_attr_init(&attr); + // Get the current (default) thread stack size. + pthread_attr_getstacksize(&attr, &curStackSize); + if (curStackSize < MIN_STACK_SIZE) { + int rc = pthread_attr_setstacksize(&attr, MIN_STACK_SIZE); + ASSERT_EQ(0, rc); + } +#endif for (int i = 0; i < kNumThreads; i++) { - PTHREAD_CREATE(&t[i], 0, (void* (*)(void *x))LotsOfStackReuse, 0); +#ifdef _WIN32 + PTHREAD_CREATE(&t[i], 0, (void* (*)(void* x))LotsOfStackReuse, 0); +#else + PTHREAD_CREATE(&t[i], &attr, (void* (*)(void* x))LotsOfStackReuse, 0); +#endif } for (int i = 0; i < kNumThreads; i++) { PTHREAD_JOIN(t[i], 0); } +#ifndef _WIN32 + pthread_attr_destroy(&attr); +#endif } // pthread_exit tries to perform unwinding stuff that leads to dlopen'ing diff --git a/flang/include/flang/Frontend/CodeGenOptions.def b/flang/include/flang/Frontend/CodeGenOptions.def index 285998e4d5a91..a9520bf974e5d 100644 --- a/flang/include/flang/Frontend/CodeGenOptions.def +++ b/flang/include/flang/Frontend/CodeGenOptions.def @@ -57,7 +57,7 @@ CODEGENOPT(DeferDescriptorMapping, 1, 0) ///< Fortran OpenMP specific optimisati ENUM_CODEGENOPT(RelocationModel, llvm::Reloc::Model, 3, llvm::Reloc::PIC_) ///< Name of the relocation model to use.
ENUM_CODEGENOPT(DebugInfo, llvm::codegenoptions::DebugInfoKind, 4, llvm::codegenoptions::NoDebugInfo) ///< Level of debug info to generate ENUM_CODEGENOPT(VecLib, llvm::driver::VectorLibrary, 4, llvm::driver::VectorLibrary::NoLibrary) ///< Vector functions library to use -ENUM_CODEGENOPT(FramePointer, llvm::FramePointerKind, 2, llvm::FramePointerKind::None) ///< Enable the usage of frame pointers +ENUM_CODEGENOPT(FramePointer, llvm::FramePointerKind, 3, llvm::FramePointerKind::None) ///< Enable the usage of frame pointers ENUM_CODEGENOPT(ComplexRange, ComplexRangeKind, 3, ComplexRangeKind::CX_Full) ///< Method for calculating complex number division ENUM_CODEGENOPT(DoConcurrentMapping, DoConcurrentMappingKind, 2, DoConcurrentMappingKind::DCMK_None) ///< Map `do concurrent` to OpenMP diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index 8842970dde613..26ed6c73b5912 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -309,6 +309,8 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, llvm::StringSwitch<std::optional<llvm::FramePointerKind>>(a->getValue()) .Case("none", llvm::FramePointerKind::None) .Case("non-leaf", llvm::FramePointerKind::NonLeaf) + .Case("non-leaf-no-reserve", + llvm::FramePointerKind::NonLeafNoReserve) .Case("reserved", llvm::FramePointerKind::Reserved) .Case("all", llvm::FramePointerKind::All) .Default(std::nullopt); diff --git a/flang/test/Driver/frame-pointer-forwarding.f90 b/flang/test/Driver/frame-pointer-forwarding.f90 index 9fcbd6e12f98b..7e97c98d899f1 100644 --- a/flang/test/Driver/frame-pointer-forwarding.f90 +++ b/flang/test/Driver/frame-pointer-forwarding.f90 @@ -1,12 +1,12 @@ ! Test that flang forwards -fno-omit-frame-pointer and -fomit-frame-pointer Flang frontend ! RUN: %flang --target=aarch64-none-none -fsyntax-only -### %s -o %t 2>&1 | FileCheck %s --check-prefix=CHECK-NOVALUE -! CHECK-NOVALUE: "-fc1"{{.*}}"-mframe-pointer=non-leaf" +! CHECK-NOVALUE: "-fc1"{{.*}}"-mframe-pointer=non-leaf-no-reserve" ! RUN: %flang -fomit-frame-pointer --target=aarch64-none-none -fsyntax-only -### %s -o %t 2>&1 | FileCheck %s --check-prefix=CHECK-NONEFP ! CHECK-NONEFP: "-fc1"{{.*}}"-mframe-pointer=none" ! RUN: %flang -fno-omit-frame-pointer --target=aarch64-none-none -fsyntax-only -### %s -o %t 2>&1 | FileCheck %s --check-prefix=CHECK-NONLEAFFP -! CHECK-NONLEAFFP: "-fc1"{{.*}}"-mframe-pointer=non-leaf" +! CHECK-NONLEAFFP: "-fc1"{{.*}}"-mframe-pointer=non-leaf-no-reserve" ! RUN: %flang -fno-omit-frame-pointer --target=x86-none-none -fsyntax-only -### %s -o %t 2>&1 | FileCheck %s --check-prefix=CHECK-ALLFP !
CHECK-ALLFP: "-fc1"{{.*}}"-mframe-pointer=all" diff --git a/libc/config/baremetal/aarch64/entrypoints.txt b/libc/config/baremetal/aarch64/entrypoints.txt index 049adb34d9d79..c69ab3d0bb37c 100644 --- a/libc/config/baremetal/aarch64/entrypoints.txt +++ b/libc/config/baremetal/aarch64/entrypoints.txt @@ -323,6 +323,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.acos libc.src.math.acosf libc.src.math.acoshf + libc.src.math.asin libc.src.math.asinf libc.src.math.asinhf libc.src.math.atan2 diff --git a/libc/config/baremetal/arm/entrypoints.txt b/libc/config/baremetal/arm/entrypoints.txt index 2444ec5feff01..c566f8ad08c8e 100644 --- a/libc/config/baremetal/arm/entrypoints.txt +++ b/libc/config/baremetal/arm/entrypoints.txt @@ -326,6 +326,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.acos libc.src.math.acosf libc.src.math.acoshf + libc.src.math.asin libc.src.math.asinf libc.src.math.asinhf libc.src.math.atan2 diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index e0dd15b803253..42571862b24b2 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -327,6 +327,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.unistd.execve libc.src.unistd.faccessat libc.src.unistd.fchdir + libc.src.unistd.fchown libc.src.unistd.fpathconf libc.src.unistd.fsync libc.src.unistd.ftruncate @@ -1143,6 +1144,7 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.time.ctime_r libc.src.time.clock libc.src.time.clock_gettime + libc.src.time.clock_settime libc.src.time.difftime libc.src.time.gettimeofday libc.src.time.gmtime diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index 0d031d8844f13..b62a46b7178d5 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -1273,6 +1273,7 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.time.ctime_r libc.src.time.clock libc.src.time.clock_gettime + libc.src.time.clock_settime libc.src.time.difftime libc.src.time.gettimeofday libc.src.time.gmtime diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index a44e2041e57f2..8a46a7a1baae3 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -334,6 +334,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.unistd.execve libc.src.unistd.faccessat libc.src.unistd.fchdir + libc.src.unistd.fchown libc.src.unistd.fpathconf libc.src.unistd.fsync libc.src.unistd.ftruncate @@ -1313,6 +1314,7 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.time.ctime_r libc.src.time.clock libc.src.time.clock_gettime + libc.src.time.clock_settime libc.src.time.difftime libc.src.time.gettimeofday libc.src.time.gmtime diff --git a/libc/docs/headers/time.rst b/libc/docs/headers/time.rst index 55bc1a17ee285..f07e0d93a4ce6 100644 --- a/libc/docs/headers/time.rst +++ b/libc/docs/headers/time.rst @@ -67,11 +67,11 @@ Implementation Status +---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | clock_getres | | | | | | | | | | | | | | +---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| clock_gettime | |check| | |check| | | |check| | | | | | | | | | | +| clock_gettime | |check| | |check| | | |check| | | | | | | | | |check| | |check| | 
+---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | clock_nanosleep | | | | | | | | | | | | | | +---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| clock_settime | | | | | | | | | | | | | | +| clock_settime | |check| | |check| | | |check| | | | | | | | | | | +---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | ctime | |check| | |check| | | |check| | | | | | | | | | | +---------------------+---------+---------+---------+-----------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ diff --git a/libc/include/time.yaml b/libc/include/time.yaml index 88e50d1288238..c2b8a1e4cfb8e 100644 --- a/libc/include/time.yaml +++ b/libc/include/time.yaml @@ -67,6 +67,13 @@ functions: arguments: - type: clockid_t - type: struct timespec * + - name: clock_settime + standard: + - POSIX + return_type: int + arguments: + - type: clockid_t + - type: const struct timespec * - name: difftime standard: - stdc diff --git a/libc/include/unistd.yaml b/libc/include/unistd.yaml index 0e5b22e627b67..3f5e957768533 100644 --- a/libc/include/unistd.yaml +++ b/libc/include/unistd.yaml @@ -120,6 +120,14 @@ functions: return_type: int arguments: - type: int + - name: fchown + standards: + - POSIX + return_type: int + arguments: + - type: int + - type: uid_t + - type: gid_t - name: fork standards: - POSIX diff --git a/libc/src/__support/time/CMakeLists.txt b/libc/src/__support/time/CMakeLists.txt index 8247e792e8410..3851037e4161f 100644 --- a/libc/src/__support/time/CMakeLists.txt +++ b/libc/src/__support/time/CMakeLists.txt @@ -19,3 +19,12 @@ add_object_library( DEPENDS libc.src.__support.time.${LIBC_TARGET_OS}.clock_gettime ) + +if(TARGET libc.src.__support.time.${LIBC_TARGET_OS}.clock_settime) + add_object_library( + clock_settime + ALIAS + DEPENDS + libc.src.__support.time.${LIBC_TARGET_OS}.clock_settime + ) +endif() diff --git a/libc/src/__support/time/clock_settime.h b/libc/src/__support/time/clock_settime.h new file mode 100644 index 0000000000000..d8d305cadf4b9 --- /dev/null +++ b/libc/src/__support/time/clock_settime.h @@ -0,0 +1,22 @@ +//===--- clock_settime linux implementation ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_TIME_CLOCK_SETTIME_H +#define LLVM_LIBC_SRC___SUPPORT_TIME_CLOCK_SETTIME_H + +#include "hdr/types/clockid_t.h" +#include "hdr/types/struct_timespec.h" +#include "src/__support/error_or.h" + +namespace LIBC_NAMESPACE_DECL { +namespace internal { +ErrorOr<int> clock_settime(clockid_t clockid, const timespec *ts); +} // namespace internal +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_TIME_CLOCK_SETTIME_H diff --git a/libc/src/__support/time/linux/CMakeLists.txt b/libc/src/__support/time/linux/CMakeLists.txt index 6fec7eeba99ad..478529502b403 100644 --- a/libc/src/__support/time/linux/CMakeLists.txt +++ b/libc/src/__support/time/linux/CMakeLists.txt @@ -14,6 +14,21 @@ add_object_library( libc.src.__support.OSUtil.linux.vdso ) +add_object_library( + clock_settime + HDRS + ../clock_settime.h + SRCS + clock_settime.cpp + DEPENDS + libc.include.sys_syscall + libc.hdr.types.struct_timespec + libc.hdr.types.clockid_t + libc.src.__support.common + libc.src.__support.error_or + libc.src.__support.OSUtil.osutil +) + add_header_library( clock_conversion HDRS diff --git a/libc/src/__support/time/linux/clock_settime.cpp b/libc/src/__support/time/linux/clock_settime.cpp new file mode 100644 index 0000000000000..dd42610adb031 --- /dev/null +++ b/libc/src/__support/time/linux/clock_settime.cpp @@ -0,0 +1,53 @@ +//===--- clock_settime linux implementation ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/time/clock_settime.h" +#include "hdr/types/clockid_t.h" +#include "hdr/types/struct_timespec.h" +#include "src/__support/OSUtil/syscall.h" +#include "src/__support/common.h" +#include "src/__support/error_or.h" +#include "src/__support/macros/config.h" +#include <sys/syscall.h> + +#if defined(SYS_clock_settime64) +#include <linux/time_types.h> +#endif + +namespace LIBC_NAMESPACE_DECL { +namespace internal { +ErrorOr<int> clock_settime(clockid_t clockid, const timespec *ts) { + int ret; +#if defined(SYS_clock_settime) + ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_clock_settime, + static_cast<long>(clockid), + reinterpret_cast<long>(ts)); +#elif defined(SYS_clock_settime64) + static_assert( + sizeof(time_t) == sizeof(int64_t), + "SYS_clock_settime64 requires struct timespec with 64-bit members."); + + __kernel_timespec ts64{}; + + // Populate the 64-bit kernel structure from the user-provided timespec + ts64.tv_sec = static_cast<int64_t>(ts->tv_sec); + ts64.tv_nsec = static_cast<int64_t>(ts->tv_nsec); + + ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_clock_settime64, + static_cast<long>(clockid), + reinterpret_cast<long>(&ts64)); +#else +#error "SYS_clock_settime and SYS_clock_settime64 syscalls not available."
+#endif + if (ret < 0) + return Error(-ret); + return ret; +} + +} // namespace internal +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/time/CMakeLists.txt b/libc/src/time/CMakeLists.txt index ec942e38d1af5..4d647c22c3239 100644 --- a/libc/src/time/CMakeLists.txt +++ b/libc/src/time/CMakeLists.txt @@ -245,3 +245,11 @@ add_entrypoint_object( DEPENDS .${LIBC_TARGET_OS}.clock_getres ) + +add_entrypoint_object( + clock_settime + ALIAS + DEPENDS + .${LIBC_TARGET_OS}.clock_settime +) + diff --git a/libc/src/time/clock_settime.h b/libc/src/time/clock_settime.h new file mode 100644 index 0000000000000..9321dd1074101 --- /dev/null +++ b/libc/src/time/clock_settime.h @@ -0,0 +1,22 @@ +//===-- Implementation header for clock_settime function --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_TIME_CLOCK_SETTIME_H +#define LLVM_LIBC_SRC_TIME_CLOCK_SETTIME_H + +#include "hdr/types/clockid_t.h" +#include "hdr/types/struct_timespec.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int clock_settime(clockid_t clockid, const timespec *tp); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_TIME_CLOCK_SETTIME_H diff --git a/libc/src/time/linux/CMakeLists.txt b/libc/src/time/linux/CMakeLists.txt index a6ec7c7c06963..6ea04597063cb 100644 --- a/libc/src/time/linux/CMakeLists.txt +++ b/libc/src/time/linux/CMakeLists.txt @@ -54,6 +54,19 @@ add_entrypoint_object( libc.src.errno.errno ) +add_entrypoint_object( + clock_settime + SRCS + clock_settime.cpp + HDRS + ../clock_settime.h + DEPENDS + libc.hdr.types.clockid_t + libc.hdr.types.struct_timespec + libc.src.__support.time.clock_settime + libc.src.errno.errno +) + add_entrypoint_object( gettimeofday SRCS diff --git a/libc/src/time/linux/clock.cpp b/libc/src/time/linux/clock.cpp index c38697cd0668e..c560bd10be83c 100644 --- a/libc/src/time/linux/clock.cpp +++ b/libc/src/time/linux/clock.cpp @@ -19,7 +19,7 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(clock_t, clock, ()) { using namespace time_units; - struct timespec ts; + timespec ts; auto result = internal::clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); if (!result.has_value()) { libc_errno = result.error(); diff --git a/libc/src/time/linux/clock_gettime.cpp b/libc/src/time/linux/clock_gettime.cpp index b3fcd2b22f9da..52ace2a743dd4 100644 --- a/libc/src/time/linux/clock_gettime.cpp +++ b/libc/src/time/linux/clock_gettime.cpp @@ -15,8 +15,7 @@ namespace LIBC_NAMESPACE_DECL { // TODO(michaelrj): Move this into time/linux with the other syscalls. -LLVM_LIBC_FUNCTION(int, clock_gettime, - (clockid_t clockid, struct timespec *ts)) { +LLVM_LIBC_FUNCTION(int, clock_gettime, (clockid_t clockid, timespec *ts)) { auto result = internal::clock_gettime(clockid, ts); // A negative return value indicates an error with the magnitude of the diff --git a/libc/src/time/linux/clock_settime.cpp b/libc/src/time/linux/clock_settime.cpp new file mode 100644 index 0000000000000..3c582cf0b4646 --- /dev/null +++ b/libc/src/time/linux/clock_settime.cpp @@ -0,0 +1,30 @@ +//===---------- Linux implementation of the POSIX clock_settime function --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/time/clock_settime.h" +#include "src/__support/common.h" +#include "src/__support/libc_errno.h" +#include "src/__support/macros/config.h" +#include "src/__support/time/clock_settime.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, clock_settime, + (clockid_t clockid, const timespec *ts)) { + auto result = internal::clock_settime(clockid, ts); + + // A negative return value indicates an error with the magnitude of the + // value being the error code. + if (!result.has_value()) { + libc_errno = result.error(); + return -1; + } + return 0; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/time/linux/nanosleep.cpp b/libc/src/time/linux/nanosleep.cpp index e5df1585df988..a30b97de40492 100644 --- a/libc/src/time/linux/nanosleep.cpp +++ b/libc/src/time/linux/nanosleep.cpp @@ -18,8 +18,7 @@ namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(int, nanosleep, - (const struct timespec *req, struct timespec *rem)) { +LLVM_LIBC_FUNCTION(int, nanosleep, (const timespec *req, timespec *rem)) { #if SYS_nanosleep int ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_nanosleep, req, rem); #elif defined(SYS_clock_nanosleep_time64) diff --git a/libc/src/time/linux/timespec_get.cpp b/libc/src/time/linux/timespec_get.cpp index a4d4372332732..031cb9f83b1c3 100644 --- a/libc/src/time/linux/timespec_get.cpp +++ b/libc/src/time/linux/timespec_get.cpp @@ -15,7 +15,7 @@ namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(int, timespec_get, (struct timespec * ts, int base)) { +LLVM_LIBC_FUNCTION(int, timespec_get, (timespec * ts, int base)) { clockid_t clockid; switch (base) { case TIME_UTC: diff --git a/libc/src/time/strftime.cpp b/libc/src/time/strftime.cpp index 89b7d9bb7c1b9..ff8c05a0b07da 100644 --- a/libc/src/time/strftime.cpp +++ b/libc/src/time/strftime.cpp @@ -23,10 +23,10 @@ LLVM_LIBC_FUNCTION(size_t, strftime, printf_core::WriteMode::FILL_BUFF_AND_DROP_OVERFLOW>::value> wb(buffer, (buffsz > 0 ? buffsz - 1 : 0)); printf_core::Writer writer(wb); - int ret = strftime_core::strftime_main(&writer, format, timeptr); + auto ret = strftime_core::strftime_main(&writer, format, timeptr); if (buffsz > 0) // if the buffsz is 0 the buffer may be a null pointer. wb.buff[wb.buff_cur] = '\0'; - return (ret < 0 || static_cast<size_t>(ret) >= buffsz) ? 0 : ret; + return (!ret.has_value() || ret.value() >= buffsz) ?
0 : ret.value(); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/time/strftime_core/CMakeLists.txt b/libc/src/time/strftime_core/CMakeLists.txt index 3ffd283ead7fe..a9aa573cc9a63 100644 --- a/libc/src/time/strftime_core/CMakeLists.txt +++ b/libc/src/time/strftime_core/CMakeLists.txt @@ -43,6 +43,7 @@ add_header_library( .core_structs .parser .converter + libc.src.__support.error_or libc.src.stdio.printf_core.writer libc.hdr.types.struct_tm ) diff --git a/libc/src/time/strftime_core/strftime_main.h b/libc/src/time/strftime_core/strftime_main.h index 2b136d83234cd..855a44107914c 100644 --- a/libc/src/time/strftime_core/strftime_main.h +++ b/libc/src/time/strftime_core/strftime_main.h @@ -10,6 +10,7 @@ #define LLVM_LIBC_SRC_STDIO_STRFTIME_CORE_STRFTIME_MAIN_H #include "hdr/types/struct_tm.h" +#include "src/__support/error_or.h" #include "src/__support/macros/config.h" #include "src/stdio/printf_core/writer.h" #include "src/time/strftime_core/converter.h" @@ -20,8 +21,8 @@ namespace LIBC_NAMESPACE_DECL { namespace strftime_core { template <printf_core::WriteMode write_mode> -int strftime_main(printf_core::Writer<write_mode> *writer, - const char *__restrict str, const tm *timeptr) { +ErrorOr<size_t> strftime_main(printf_core::Writer<write_mode> *writer, + const char *__restrict str, const tm *timeptr) { Parser parser(str); int result = 0; for (strftime_core::FormatSection cur_section = parser.get_next_section(); @@ -33,11 +34,10 @@ int strftime_main(printf_core::Writer<write_mode> *writer, result = writer->write(cur_section.raw_string); if (result < 0) - return result; + return Error(-result); } - // TODO: Use ErrorOr - return static_cast<int>(writer->get_chars_written()); + return writer->get_chars_written(); } } // namespace strftime_core diff --git a/libc/src/time/strftime_l.cpp b/libc/src/time/strftime_l.cpp index 409f8683b7289..2ec90634ea347 100644 --- a/libc/src/time/strftime_l.cpp +++ b/libc/src/time/strftime_l.cpp @@ -26,10 +26,10 @@ LLVM_LIBC_FUNCTION(size_t, strftime_l, printf_core::WriteMode::FILL_BUFF_AND_DROP_OVERFLOW>::value> wb(buffer, (buffsz > 0 ? buffsz - 1 : 0)); printf_core::Writer writer(wb); - int ret = strftime_core::strftime_main(&writer, format, timeptr); + auto ret = strftime_core::strftime_main(&writer, format, timeptr); if (buffsz > 0) // if the buffsz is 0 the buffer may be a null pointer. wb.buff[wb.buff_cur] = '\0'; - return (ret < 0 || static_cast<size_t>(ret) >= buffsz) ? 0 : ret; + return (!ret.has_value() || ret.value() >= buffsz) ? 0 : ret.value(); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/unistd/CMakeLists.txt b/libc/src/unistd/CMakeLists.txt index 337480cbbf928..b7444a4722b0d 100644 --- a/libc/src/unistd/CMakeLists.txt +++ b/libc/src/unistd/CMakeLists.txt @@ -76,6 +76,13 @@ add_entrypoint_object( .${LIBC_TARGET_OS}.fchdir ) +add_entrypoint_object( + fchown + ALIAS + DEPENDS + .${LIBC_TARGET_OS}.fchown +) + add_entrypoint_object( fork ALIAS diff --git a/libc/src/unistd/fchown.h b/libc/src/unistd/fchown.h new file mode 100644 index 0000000000000..9ea44426568cc --- /dev/null +++ b/libc/src/unistd/fchown.h @@ -0,0 +1,22 @@ +//===-- Implementation header for fchown ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_UNISTD_FCHOWN_H +#define LLVM_LIBC_SRC_UNISTD_FCHOWN_H + +#include "hdr/types/gid_t.h" +#include "hdr/types/uid_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int fchown(int fildes, uid_t owner, gid_t group); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_UNISTD_FCHOWN_H diff --git a/libc/src/unistd/linux/CMakeLists.txt b/libc/src/unistd/linux/CMakeLists.txt index c2dacc6456e27..c45b6ef1c5d80 100644 --- a/libc/src/unistd/linux/CMakeLists.txt +++ b/libc/src/unistd/linux/CMakeLists.txt @@ -120,6 +120,20 @@ add_entrypoint_object( libc.src.errno.errno ) +add_entrypoint_object( + fchown + SRCS + fchown.cpp + HDRS + ../fchown.h + DEPENDS + libc.hdr.types.uid_t + libc.hdr.types.gid_t + libc.include.sys_syscall + libc.src.__support.OSUtil.osutil + libc.src.errno.errno +) + add_entrypoint_object( fork SRCS diff --git a/libc/src/unistd/linux/fchown.cpp b/libc/src/unistd/linux/fchown.cpp new file mode 100644 index 0000000000000..9cf3d139050c1 --- /dev/null +++ b/libc/src/unistd/linux/fchown.cpp @@ -0,0 +1,31 @@ +//===-- Linux implementation of fchown ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/unistd/fchown.h" + +#include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/common.h" + +#include "hdr/types/gid_t.h" +#include "hdr/types/uid_t.h" +#include "src/__support/libc_errno.h" +#include "src/__support/macros/config.h" +#include <sys/syscall.h> // For syscall numbers. + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, fchown, (int fildes, uid_t owner, gid_t group)) { + int ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_fchown, fildes, owner, group); + if (ret < 0) { + libc_errno = -ret; + return -1; + } + return 0; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/test/UnitTest/BazelFilePath.cpp b/libc/test/UnitTest/BazelFilePath.cpp index ee5fcaaa63d91..7f9f42b46dca9 100644 --- a/libc/test/UnitTest/BazelFilePath.cpp +++ b/libc/test/UnitTest/BazelFilePath.cpp @@ -20,6 +20,10 @@ namespace testing { CString libc_make_test_file_path_func(const char *file_name) { // This is the path to the folder bazel wants the test outputs written to. const char *UNDECLARED_OUTPUTS_PATH = getenv("TEST_UNDECLARED_OUTPUTS_DIR"); + // Fall back to an empty path when not run under Bazel; otherwise the string + // construction below would dereference a null pointer.
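+ // getenv() returns a null pointer when TEST_UNDECLARED_OUTPUTS_DIR is + // unset, e.g. when the test binary is run directly rather than under + // `bazel test`.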
+ if (UNDECLARED_OUTPUTS_PATH == nullptr) + UNDECLARED_OUTPUTS_PATH = ""; return cpp::string(UNDECLARED_OUTPUTS_PATH) + file_name; } diff --git a/libc/test/src/time/CMakeLists.txt b/libc/test/src/time/CMakeLists.txt index 03e5428292418..c8e113f06d50b 100644 --- a/libc/test/src/time/CMakeLists.txt +++ b/libc/test/src/time/CMakeLists.txt @@ -124,6 +124,21 @@ add_libc_test( libc.src.time.clock_getres ) +add_libc_test( + clock_settime_test + SUITE + libc_time_unittests + SRCS + clock_settime_test.cpp + DEPENDS + libc.src.time.clock_settime + libc.hdr.types.time_t + libc.hdr.types.struct_timespec + libc.hdr.time_macros + libc.hdr.errno_macros + libc.test.UnitTest.ErrnoCheckingTest +) + add_libc_unittest( difftime_test SUITE diff --git a/libc/test/src/time/clock_settime_test.cpp b/libc/test/src/time/clock_settime_test.cpp new file mode 100644 index 0000000000000..ccbad9ed2e847 --- /dev/null +++ b/libc/test/src/time/clock_settime_test.cpp @@ -0,0 +1,55 @@ +//===-- Unittests for clock_settime ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "hdr/errno_macros.h" +#include "hdr/time_macros.h" +#include "hdr/types/struct_timespec.h" +#include "src/time/clock_settime.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/Test.h" + +using LlvmLibcClockSetTime = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +#ifdef CLOCK_MONOTONIC +TEST_F(LlvmLibcClockSetTime, MonotonicIsNotSettable) { + timespec ts = {0, 0}; + int result = LIBC_NAMESPACE::clock_settime(CLOCK_MONOTONIC, &ts); + ASSERT_EQ(result, -1); + ASSERT_ERRNO_EQ(EINVAL); +} +#endif // CLOCK_MONOTONIC + +TEST_F(LlvmLibcClockSetTime, InvalidClockId) { + timespec ts = {0, 0}; + int result = LIBC_NAMESPACE::clock_settime(static_cast<clockid_t>(-1), &ts); + ASSERT_EQ(result, -1); + ASSERT_ERRNO_EQ(EINVAL); +} + +TEST_F(LlvmLibcClockSetTime, InvalidTimespecNsec) { + timespec ts = {0, 1000000000L}; + int result = LIBC_NAMESPACE::clock_settime(CLOCK_REALTIME, &ts); + ASSERT_EQ(result, -1); + ASSERT_ERRNO_EQ(EINVAL); +} + +TEST_F(LlvmLibcClockSetTime, NullPointerIsEFAULT) { + int result = LIBC_NAMESPACE::clock_settime(CLOCK_REALTIME, nullptr); + ASSERT_EQ(result, -1); + ASSERT_ERRNO_EQ(EFAULT); +} + +TEST_F(LlvmLibcClockSetTime, ClockIsSet) { + timespec ts = {0, 0}; + int result = LIBC_NAMESPACE::clock_settime(CLOCK_REALTIME, &ts); + if (result == 0) { + ASSERT_ERRNO_SUCCESS(); + } else { + ASSERT_ERRNO_EQ(EPERM); + } +} diff --git a/libc/test/src/unistd/CMakeLists.txt b/libc/test/src/unistd/CMakeLists.txt index 07070535459ec..3012ea9a466f4 100644 --- a/libc/test/src/unistd/CMakeLists.txt +++ b/libc/test/src/unistd/CMakeLists.txt @@ -146,6 +146,26 @@ add_libc_unittest( libc.test.UnitTest.ErrnoSetterMatcher ) +add_libc_unittest( + fchown_test + SUITE + libc_unistd_unittests + SRCS + fchown_test.cpp + DEPENDS + libc.hdr.fcntl_macros + libc.include.unistd + libc.src.errno.errno + libc.src.unistd.fchown + libc.src.unistd.close + libc.src.unistd.unlink + libc.src.fcntl.open + libc.src.unistd.getuid + libc.src.unistd.getgid + libc.test.UnitTest.ErrnoCheckingTest + libc.test.UnitTest.ErrnoSetterMatcher +) + add_libc_unittest( ftruncate_test SUITE diff --git a/libc/test/src/unistd/fchown_test.cpp b/libc/test/src/unistd/fchown_test.cpp new file mode 100644 index 0000000000000..7954410afb929
--- /dev/null +++ b/libc/test/src/unistd/fchown_test.cpp @@ -0,0 +1,50 @@ +//===-- Unittests for fchown ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/fcntl/open.h" +#include "src/unistd/close.h" +#include "src/unistd/fchown.h" +#include "src/unistd/getgid.h" +#include "src/unistd/getuid.h" +#include "src/unistd/unlink.h" + +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" +#include "test/UnitTest/Test.h" + +#include "hdr/fcntl_macros.h" +#include <sys/stat.h> + +using LlvmLibcFchownTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcFchownTest, FchownSuccess) { + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; + uid_t my_uid = LIBC_NAMESPACE::getuid(); + gid_t my_gid = LIBC_NAMESPACE::getgid(); + constexpr const char *FILENAME = "fchown.test"; + auto TEST_FILE = libc_make_test_file_path(FILENAME); + + // Create a test file. + int write_fd = LIBC_NAMESPACE::open(TEST_FILE, O_WRONLY | O_CREAT, S_IRWXU); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(write_fd, 0); + + // Change the ownership of the file. + ASSERT_THAT(LIBC_NAMESPACE::fchown(write_fd, my_uid, my_gid), Succeeds(0)); + + // Close the file descriptor. + ASSERT_THAT(LIBC_NAMESPACE::close(write_fd), Succeeds(0)); + + // Clean up the test file. + ASSERT_THAT(LIBC_NAMESPACE::unlink(TEST_FILE), Succeeds(0)); +} + +TEST_F(LlvmLibcFchownTest, FchownInvalidFileDescriptor) { + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; + ASSERT_THAT(LIBC_NAMESPACE::fchown(-1, 1000, 1000), Fails(EBADF)); +} diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index 6d3036dfedddf..131ba99357d62 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -1066,7 +1066,6 @@ set(files sstream stack stdatomic.h - stdbool.h stddef.h stdexcept stdio.h diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in index 27f60e0c0a055..7ca57f6455dd8 100644 --- a/libcxx/include/module.modulemap.in +++ b/libcxx/include/module.modulemap.in @@ -2437,10 +2437,6 @@ module std_stdatomic_h [system] { header "stdatomic.h" export * } -module std_stdbool_h [system] { - // <stdbool.h>'s __bool_true_false_are_defined macro requires textual inclusion. - textual header "stdbool.h" -} module std_stddef_h [system] { // supports being included multiple times with different pre-defined macros textual header "stddef.h" diff --git a/libcxx/include/stdbool.h b/libcxx/include/stdbool.h deleted file mode 100644 index 768d08247256a..0000000000000 --- a/libcxx/include/stdbool.h +++ /dev/null @@ -1,44 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCPP_STDBOOL_H -#define _LIBCPP_STDBOOL_H - -/* - stdbool.h synopsis - -Macros: - - __bool_true_false_are_defined - -*/ - -#if defined(__cplusplus) && __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS) -# include <__cxx03/__config> -#else -# include <__config> -#endif - -#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -#endif - -#if __has_include_next(<stdbool.h>) -# include_next <stdbool.h> -#endif - -#ifdef __cplusplus -# undef bool -# undef true -# undef false -# undef __bool_true_false_are_defined -# define __bool_true_false_are_defined 1 -#endif - -#endif // _LIBCPP_STDBOOL_H diff --git a/lld/MachO/InputFiles.cpp b/lld/MachO/InputFiles.cpp index 20e4a1d755229..d0128d03a9eab 100644 --- a/lld/MachO/InputFiles.cpp +++ b/lld/MachO/InputFiles.cpp @@ -808,6 +808,17 @@ void ObjFile::parseSymbols(ArrayRef<typename LP::section> sectionHeaders, continue; if ((sym.n_type & N_TYPE) == N_SECT) { + if (sym.n_sect == 0) { + fatal("section symbol " + StringRef(strtab + sym.n_strx) + " in " + + toString(this) + " has an invalid section index [0]"); + } + if (sym.n_sect > sections.size()) { + fatal("section symbol " + StringRef(strtab + sym.n_strx) + " in " + + toString(this) + " has an invalid section index [" + + Twine(static_cast<uint32_t>(sym.n_sect)) + + "] greater than the total number of sections [" + + Twine(sections.size()) + "]"); + } Subsections &subsections = sections[sym.n_sect - 1]->subsections; // parseSections() may have chosen not to parse this section. if (subsections.empty()) diff --git a/lld/test/MachO/handle-invalid-section-reference-too-big.test b/lld/test/MachO/handle-invalid-section-reference-too-big.test new file mode 100644 index 0000000000000..1642d63e50af4 --- /dev/null +++ b/lld/test/MachO/handle-invalid-section-reference-too-big.test @@ -0,0 +1,128 @@ +# REQUIRES: aarch64 + +## This is a regression test which makes sure that when there is an invalid section index +## associated with a section symbol, the linker does not segfault. + +## Test YAML content was created using the following steps: +## 1. Create an object file from the following assembly +## `llvm-mc -filetype=obj -triple=arm64-apple-darwin symbol.s -o symbol.o` +## +## .text +## .section __TEST,__mystuff +## .globl _mysec +## _mysec: +## .byte 0xC3 +## +## 2. Use obj2yaml to convert the object file to YAML +## `obj2yaml symbol.o -o symbol.yaml` +## +## 3. Manually set the n_sect value of the ltmp1 symbol to 10, which is greater than the total number of sections (2).
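+## (n_sect indices are 1-based; this file only has 2 sections, so any value +## above 2 is out of range and must be diagnosed instead of crashing.)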
+## + +# RUN: yaml2obj %s -o %t +# RUN: not %lld -platform_version macos 10.14 11.0 -arch arm64 %t 2>&1 | FileCheck %s --check-prefix=FATAL + +# FATAL: error: section symbol ltmp0 in {{.*}} has an invalid section index [10] greater than the total number of sections [2] + +--- !mach-o +FileHeader: + magic: 0xFEEDFACF + cputype: 0x100000C + cpusubtype: 0x0 + filetype: 0x1 + ncmds: 3 + sizeofcmds: 336 + flags: 0x0 + reserved: 0x0 +LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 232 + segname: '' + vmaddr: 0 + vmsize: 1 + fileoff: 368 + filesize: 1 + maxprot: 7 + initprot: 7 + nsects: 2 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x0 + size: 0 + offset: 0x170 + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x80000000 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: '' + - sectname: __mystuff + segname: __TEST + addr: 0x0 + size: 1 + offset: 0x170 + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x0 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: C3 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 376 + nsyms: 3 + stroff: 424 + strsize: 24 + - cmd: LC_DYSYMTAB + cmdsize: 80 + ilocalsym: 0 + nlocalsym: 2 + iextdefsym: 2 + nextdefsym: 1 + iundefsym: 3 + nundefsym: 0 + tocoff: 0 + ntoc: 0 + modtaboff: 0 + nmodtab: 0 + extrefsymoff: 0 + nextrefsyms: 0 + indirectsymoff: 0 + nindirectsyms: 0 + extreloff: 0 + nextrel: 0 + locreloff: 0 + nlocrel: 0 +LinkEditData: + NameList: + - n_strx: 14 + n_type: 0xE + n_sect: 10 + n_desc: 0 + n_value: 0 + - n_strx: 8 + n_type: 0xE + n_sect: 2 + n_desc: 0 + n_value: 0 + - n_strx: 1 + n_type: 0xF + n_sect: 2 + n_desc: 0 + n_value: 0 + StringTable: + - '' + - _mysec + - ltmp1 + - ltmp0 + - '' + - '' + - '' + - '' +... diff --git a/lld/test/MachO/handle-invalid-section-reference-zero.test b/lld/test/MachO/handle-invalid-section-reference-zero.test new file mode 100644 index 0000000000000..ab636705198e5 --- /dev/null +++ b/lld/test/MachO/handle-invalid-section-reference-zero.test @@ -0,0 +1,128 @@ +# REQUIRES: aarch64 + +## This is a regression test which makes sure that when there is an invalid section index +## associated with a section symbol, the linker does not segfault. + +## Test YAML content was created using the following steps +## 1. Create an object file from the following assembly +## `llvm-mc -filetype=obj -triple=arm64-apple-darwin symbol.s -o symbol.o` +## +## .text +## .section __TEST,__mystuff +## .globl _mysec +## _mysec: +## .byte 0xC3 +## +## 2. Use obj2yaml to convert object file to yaml +## `obj2yaml symbol.o -o symbol.yaml` +## +## 3. Manually set n_sect value of ltmp1 symbol to 0 instead of 1. 
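+## (In Mach-O, n_sect 0 is NO_SECT, which is invalid for an N_SECT symbol; +## without the check the linker would index sections[-1].)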
+## + +# RUN: yaml2obj %s -o %t +# RUN: not %lld -platform_version macos 10.14 11.0 -arch arm64 %t 2>&1 | FileCheck %s --check-prefix=FATAL + +# FATAL: error: section symbol ltmp0 in {{.*}} has an invalid section index [0] + +--- !mach-o +FileHeader: + magic: 0xFEEDFACF + cputype: 0x100000C + cpusubtype: 0x0 + filetype: 0x1 + ncmds: 3 + sizeofcmds: 336 + flags: 0x0 + reserved: 0x0 +LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 232 + segname: '' + vmaddr: 0 + vmsize: 1 + fileoff: 368 + filesize: 1 + maxprot: 7 + initprot: 7 + nsects: 2 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x0 + size: 0 + offset: 0x170 + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x80000000 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: '' + - sectname: __mystuff + segname: __TEST + addr: 0x0 + size: 1 + offset: 0x170 + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x0 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: C3 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 376 + nsyms: 3 + stroff: 424 + strsize: 24 + - cmd: LC_DYSYMTAB + cmdsize: 80 + ilocalsym: 0 + nlocalsym: 2 + iextdefsym: 2 + nextdefsym: 1 + iundefsym: 3 + nundefsym: 0 + tocoff: 0 + ntoc: 0 + modtaboff: 0 + nmodtab: 0 + extrefsymoff: 0 + nextrefsyms: 0 + indirectsymoff: 0 + nindirectsyms: 0 + extreloff: 0 + nextrel: 0 + locreloff: 0 + nlocrel: 0 +LinkEditData: + NameList: + - n_strx: 14 + n_type: 0xE + n_sect: 0 + n_desc: 0 + n_value: 0 + - n_strx: 8 + n_type: 0xE + n_sect: 2 + n_desc: 0 + n_value: 0 + - n_strx: 1 + n_type: 0xF + n_sect: 2 + n_desc: 0 + n_value: 0 + StringTable: + - '' + - _mysec + - ltmp1 + - ltmp0 + - '' + - '' + - '' + - '' +... diff --git a/lldb/bindings/python/python-wrapper.swig b/lldb/bindings/python/python-wrapper.swig index 3a0995e84f643..84fb3a95c0942 100644 --- a/lldb/bindings/python/python-wrapper.swig +++ b/lldb/bindings/python/python-wrapper.swig @@ -422,6 +422,18 @@ void *lldb_private::python::LLDBSWIGPython_CastPyObjectToSBBreakpoint(PyObject * return sb_ptr; } +void *lldb_private::python::LLDBSWIGPython_CastPyObjectToSBThread(PyObject * data) { + lldb::SBThread *sb_ptr = nullptr; + + int valid_cast = + SWIG_ConvertPtr(data, (void **)&sb_ptr, SWIGTYPE_p_lldb__SBThread, 0); + + if (valid_cast == -1) + return NULL; + + return sb_ptr; +} + void *lldb_private::python::LLDBSWIGPython_CastPyObjectToSBFrame(PyObject * data) { lldb::SBFrame *sb_ptr = nullptr; diff --git a/lldb/examples/python/templates/scripted_frame_provider.py b/lldb/examples/python/templates/scripted_frame_provider.py index 20f4d76d188c2..7a72f1a24c9da 100644 --- a/lldb/examples/python/templates/scripted_frame_provider.py +++ b/lldb/examples/python/templates/scripted_frame_provider.py @@ -31,7 +31,54 @@ class ScriptedFrameProvider(metaclass=ABCMeta): ) """ + @staticmethod + def applies_to_thread(thread): + """Determine if this frame provider should be used for a given thread. + + This static method is called before creating an instance of the frame + provider to determine if it should be applied to a specific thread. + Override this method to provide custom filtering logic. + + Args: + thread (lldb.SBThread): The thread to check. + + Returns: + bool: True if this frame provider should be used for the thread, + False otherwise. The default implementation returns True for + all threads. + + Example: + + .. 
code-block:: python + + @staticmethod + def applies_to_thread(thread): + # Only apply to thread 1 + return thread.GetIndexID() == 1 + """ + return True + + @staticmethod + @abstractmethod + def get_description(): + """Get a description of this frame provider. + + This method should return a human-readable string describing what + this frame provider does. The description is used for debugging + and display purposes. + + Returns: + str: A description of the frame provider. + + Example: + + .. code-block:: python + + @staticmethod + def get_description(): + return "Crash log frame provider for thread 1" + """ + pass + def __init__(self, input_frames, args): """Construct a scripted frame provider. diff --git a/lldb/examples/python/templates/scripted_process.py b/lldb/examples/python/templates/scripted_process.py index 49059d533f38a..136edce165140 100644 --- a/lldb/examples/python/templates/scripted_process.py +++ b/lldb/examples/python/templates/scripted_process.py @@ -245,6 +245,7 @@ def __init__(self, process, args): key/value pairs used by the scripted thread. """ self.target = None + self.arch = None self.originating_process = None self.process = None self.args = None @@ -266,6 +267,9 @@ def __init__(self, process, args): and process.IsValid() ): self.target = process.target + triple = self.target.triple + if triple: + self.arch = triple.split("-")[0] self.originating_process = process self.process = self.target.GetProcess() self.get_register_info() @@ -352,17 +356,14 @@ def get_stackframes(self): def get_register_info(self): if self.register_info is None: self.register_info = dict() - if "x86_64" in self.originating_process.arch: + if "x86_64" in self.arch: self.register_info["sets"] = ["General Purpose Registers"] self.register_info["registers"] = INTEL64_GPR - elif ( - "arm64" in self.originating_process.arch - or self.originating_process.arch == "aarch64" - ): + elif "arm64" in self.arch or self.arch == "aarch64": self.register_info["sets"] = ["General Purpose Registers"] self.register_info["registers"] = ARM64_GPR else: - raise ValueError("Unknown architecture", self.originating_process.arch) + raise ValueError("Unknown architecture", self.arch) return self.register_info @abstractmethod @@ -405,11 +406,12 @@ def __init__(self, thread, args): """Construct a scripted frame. Args: - thread (ScriptedThread): The thread owning this frame. + thread (ScriptedThread/lldb.SBThread): The thread owning this frame. args (lldb.SBStructuredData): A Dictionary holding arbitrary key/value pairs used by the scripted frame.
""" self.target = None + self.arch = None self.originating_thread = None self.thread = None self.args = None @@ -419,15 +421,17 @@ def __init__(self, thread, args): self.register_ctx = {} self.variables = [] - if ( - isinstance(thread, ScriptedThread) - or isinstance(thread, lldb.SBThread) - and thread.IsValid() + if isinstance(thread, ScriptedThread) or ( + isinstance(thread, lldb.SBThread) and thread.IsValid() ): - self.target = thread.target self.process = thread.process + self.target = self.process.target + triple = self.target.triple + if triple: + self.arch = triple.split("-")[0] + tid = thread.tid if isinstance(thread, ScriptedThread) else thread.id self.originating_thread = thread - self.thread = self.process.GetThreadByIndexID(thread.tid) + self.thread = self.process.GetThreadByIndexID(tid) self.get_register_info() @abstractmethod @@ -508,7 +512,18 @@ def get_variables(self, filters): def get_register_info(self): if self.register_info is None: - self.register_info = self.originating_thread.get_register_info() + if isinstance(self.originating_thread, ScriptedThread): + self.register_info = self.originating_thread.get_register_info() + elif isinstance(self.originating_thread, lldb.SBThread): + self.register_info = dict() + if "x86_64" in self.arch: + self.register_info["sets"] = ["General Purpose Registers"] + self.register_info["registers"] = INTEL64_GPR + elif "arm64" in self.arch or self.arch == "aarch64": + self.register_info["sets"] = ["General Purpose Registers"] + self.register_info["registers"] = ARM64_GPR + else: + raise ValueError("Unknown architecture", self.arch) return self.register_info @abstractmethod @@ -642,12 +657,12 @@ def get_stop_reason(self): # TODO: Passthrough stop reason from driving process if self.driving_thread.GetStopReason() != lldb.eStopReasonNone: - if "arm64" in self.originating_process.arch: + if "arm64" in self.arch: stop_reason["type"] = lldb.eStopReasonException stop_reason["data"]["desc"] = ( self.driving_thread.GetStopDescription(100) ) - elif self.originating_process.arch == "x86_64": + elif self.arch == "x86_64": stop_reason["type"] = lldb.eStopReasonSignal stop_reason["data"]["signal"] = signal.SIGTRAP else: diff --git a/lldb/include/lldb/API/SBTarget.h b/lldb/include/lldb/API/SBTarget.h index 379a0bb7e9513..c2fd3e2f50e3b 100644 --- a/lldb/include/lldb/API/SBTarget.h +++ b/lldb/include/lldb/API/SBTarget.h @@ -19,6 +19,7 @@ #include "lldb/API/SBLaunchInfo.h" #include "lldb/API/SBStatisticsOptions.h" #include "lldb/API/SBSymbolContextList.h" +#include "lldb/API/SBThreadCollection.h" #include "lldb/API/SBType.h" #include "lldb/API/SBValue.h" #include "lldb/API/SBWatchpoint.h" @@ -986,6 +987,35 @@ class LLDB_API SBTarget { lldb::SBMutex GetAPIMutex() const; + /// Register a scripted frame provider for this target. + /// If a scripted frame provider with the same name and same argument + /// dictionary is already registered on this target, it will be overwritten. + /// + /// \param[in] class_name + /// The name of the Python class that implements the frame provider. + /// + /// \param[in] args_dict + /// A dictionary of arguments to pass to the frame provider class. + /// + /// \param[out] error + /// An error object indicating success or failure. + /// + /// \return + /// A unique identifier for the frame provider descriptor that was + /// registered. 0 if the registration failed. 
+ uint32_t RegisterScriptedFrameProvider(const char *class_name, + lldb::SBStructuredData args_dict, + lldb::SBError &error); + + /// Remove a scripted frame provider from this target by id. + /// + /// \param[in] provider_id + /// The id of the frame provider to remove. + /// + /// \return + /// An error object indicating success or failure. + lldb::SBError RemoveScriptedFrameProvider(uint32_t provider_id); + protected: friend class SBAddress; friend class SBAddressRange; diff --git a/lldb/include/lldb/API/SBThread.h b/lldb/include/lldb/API/SBThread.h index f6a6d19935b83..639e7a0a1a5c0 100644 --- a/lldb/include/lldb/API/SBThread.h +++ b/lldb/include/lldb/API/SBThread.h @@ -256,6 +256,7 @@ class LLDB_API SBThread { friend class SBThreadPlan; friend class SBTrace; + friend class lldb_private::ScriptInterpreter; friend class lldb_private::python::SWIGBridge; SBThread(const lldb::ThreadSP &lldb_object_sp); diff --git a/lldb/include/lldb/API/SBThreadCollection.h b/lldb/include/lldb/API/SBThreadCollection.h index 5a052e6246026..d13dea0f11cd2 100644 --- a/lldb/include/lldb/API/SBThreadCollection.h +++ b/lldb/include/lldb/API/SBThreadCollection.h @@ -46,6 +46,7 @@ class LLDB_API SBThreadCollection { void SetOpaque(const lldb::ThreadCollectionSP &threads); private: + friend class SBTarget; friend class SBProcess; friend class SBThread; friend class SBSaveCoreOptions; diff --git a/lldb/include/lldb/Interpreter/Interfaces/ScriptedFrameProviderInterface.h b/lldb/include/lldb/Interpreter/Interfaces/ScriptedFrameProviderInterface.h index 2d9f713676f90..49b60131399d5 100644 --- a/lldb/include/lldb/Interpreter/Interfaces/ScriptedFrameProviderInterface.h +++ b/lldb/include/lldb/Interpreter/Interfaces/ScriptedFrameProviderInterface.h @@ -16,11 +16,29 @@ namespace lldb_private { class ScriptedFrameProviderInterface : public ScriptedInterface { public: + virtual bool AppliesToThread(llvm::StringRef class_name, + lldb::ThreadSP thread_sp) { + return true; + } + virtual llvm::Expected<StructuredData::GenericSP> CreatePluginObject(llvm::StringRef class_name, lldb::StackFrameListSP input_frames, StructuredData::DictionarySP args_sp) = 0; + /// Get a description string for the frame provider. + /// + /// This is called by the descriptor to fetch a description from the + /// scripted implementation. Implementations should call a static method + /// on the scripting class to retrieve the description. + /// + /// \param class_name The name of the scripting class implementing the + /// provider. + /// + /// \return A string describing what this frame provider does, or an + /// empty string if no description is available.
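+  /// +  /// On the Python side this corresponds to the provider class's static +  /// get_description() method (see scripted_frame_provider.py).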
+ virtual std::string GetDescription(llvm::StringRef class_name) { return {}; } + virtual StructuredData::ObjectSP GetFrameAtIndex(uint32_t index) { return {}; } diff --git a/lldb/include/lldb/Interpreter/ScriptInterpreter.h b/lldb/include/lldb/Interpreter/ScriptInterpreter.h index 7fed4940b85bf..0b91d6756552d 100644 --- a/lldb/include/lldb/Interpreter/ScriptInterpreter.h +++ b/lldb/include/lldb/Interpreter/ScriptInterpreter.h @@ -21,6 +21,7 @@ #include "lldb/API/SBMemoryRegionInfo.h" #include "lldb/API/SBStream.h" #include "lldb/API/SBSymbolContext.h" +#include "lldb/API/SBThread.h" #include "lldb/Breakpoint/BreakpointOptions.h" #include "lldb/Core/PluginInterface.h" #include "lldb/Core/SearchFilter.h" @@ -580,6 +581,8 @@ class ScriptInterpreter : public PluginInterface { lldb::StreamSP GetOpaqueTypeFromSBStream(const lldb::SBStream &stream) const; + lldb::ThreadSP GetOpaqueTypeFromSBThread(const lldb::SBThread &thread) const; + lldb::StackFrameSP GetOpaqueTypeFromSBFrame(const lldb::SBFrame &frame) const; SymbolContext diff --git a/lldb/include/lldb/Target/StackFrame.h b/lldb/include/lldb/Target/StackFrame.h index cdbe8ae3c6779..af2e49b4a67da 100644 --- a/lldb/include/lldb/Target/StackFrame.h +++ b/lldb/include/lldb/Target/StackFrame.h @@ -441,8 +441,11 @@ class StackFrame : public ExecutionContextScope, /// frames are included in this frame index count. uint32_t GetFrameIndex() const; - /// Set this frame's synthetic frame index. - void SetFrameIndex(uint32_t index) { m_frame_index = index; } + /// Set this frame's frame index. + void SetFrameIndex(uint32_t index) { + m_frame_index = index; + m_concrete_frame_index = index; + } /// Query this frame to find what frame it is in this Thread's /// StackFrameList, not counting inlined frames. diff --git a/lldb/include/lldb/Target/StackFrameList.h b/lldb/include/lldb/Target/StackFrameList.h index 5b0df0ddb3e29..539c070ff0f4b 100644 --- a/lldb/include/lldb/Target/StackFrameList.h +++ b/lldb/include/lldb/Target/StackFrameList.h @@ -20,13 +20,13 @@ namespace lldb_private { class ScriptedThread; -class StackFrameList { +class StackFrameList : public std::enable_shared_from_this<StackFrameList> { public: // Constructors and Destructors StackFrameList(Thread &thread, const lldb::StackFrameListSP &prev_frames_sp, bool show_inline_frames); - ~StackFrameList(); + virtual ~StackFrameList(); /// Get the number of visible frames. Frames may be created if \p can_create /// is true. Synthetic (inline) frames expanded from the concrete frame #0 @@ -106,6 +106,7 @@ class StackFrameList { protected: friend class Thread; + friend class ScriptedFrameProvider; friend class ScriptedThread; /// Use this API to build a stack frame list (used for scripted threads, for @@ -211,19 +212,23 @@ /// Whether or not to show synthetic (inline) frames. Immutable. const bool m_show_inlined_frames; + /// Returns true if fetching frames was interrupted, false otherwise. + virtual bool FetchFramesUpTo(uint32_t end_idx, + InterruptionControl allow_interrupt); + private: uint32_t SetSelectedFrameNoLock(lldb_private::StackFrame *frame); lldb::StackFrameSP GetFrameAtIndexNoLock(uint32_t idx, std::shared_lock<std::shared_mutex> &guard); + /// @{ /// These two Fetch frames APIs and SynthesizeTailCallFrames are called in /// GetFramesUpTo, they are the ones that actually add frames. They must be /// called with the writer end of the list mutex held. - - /// Returns true if fetching frames was interrupted, false otherwise.
- bool FetchFramesUpTo(uint32_t end_idx, InterruptionControl allow_interrupt); + /// /// Not currently interruptible so returns void. + /// @} void FetchOnlyConcreteFramesUpTo(uint32_t end_idx); void SynthesizeTailCallFrames(StackFrame &next_frame); @@ -231,6 +236,27 @@ const StackFrameList &operator=(const StackFrameList &) = delete; }; +/// A StackFrameList that wraps another StackFrameList and uses a +/// SyntheticFrameProvider to lazily provide frames from either the provider +/// or the underlying real stack frame list. +class SyntheticStackFrameList : public StackFrameList { +public: + SyntheticStackFrameList(Thread &thread, lldb::StackFrameListSP input_frames, + const lldb::StackFrameListSP &prev_frames_sp, + bool show_inline_frames); + +protected: + /// Override FetchFramesUpTo to lazily return frames from the provider + /// or from the actual stack frame list. + bool FetchFramesUpTo(uint32_t end_idx, + InterruptionControl allow_interrupt) override; + +private: + /// The input stack frame list that the provider transforms. + /// This could be a real StackFrameList or another SyntheticStackFrameList. + lldb::StackFrameListSP m_input_frames; +}; + } // namespace lldb_private #endif // LLDB_TARGET_STACKFRAMELIST_H diff --git a/lldb/include/lldb/Target/SyntheticFrameProvider.h b/lldb/include/lldb/Target/SyntheticFrameProvider.h index 61a492f356ece..2d5330cb03105 100644 --- a/lldb/include/lldb/Target/SyntheticFrameProvider.h +++ b/lldb/include/lldb/Target/SyntheticFrameProvider.h @@ -24,22 +24,25 @@ namespace lldb_private { /// This struct contains the metadata needed to instantiate a frame provider /// and optional filters to control which threads it applies to. -struct SyntheticFrameProviderDescriptor { +struct ScriptedFrameProviderDescriptor { /// Metadata for instantiating the provider (e.g. script class name and args). lldb::ScriptedMetadataSP scripted_metadata_sp; + /// Interface for calling static methods on the provider class. + lldb::ScriptedFrameProviderInterfaceSP interface_sp; + /// Optional list of thread specifications to which this provider applies. /// If empty, the provider applies to all threads. A thread matches if it /// satisfies ANY of the specs in this vector (OR logic). std::vector<ThreadSpec> thread_specs; - SyntheticFrameProviderDescriptor() = default; + ScriptedFrameProviderDescriptor() = default; - SyntheticFrameProviderDescriptor(lldb::ScriptedMetadataSP metadata_sp) + ScriptedFrameProviderDescriptor(lldb::ScriptedMetadataSP metadata_sp) : scripted_metadata_sp(metadata_sp) {} - SyntheticFrameProviderDescriptor(lldb::ScriptedMetadataSP metadata_sp, - const std::vector<ThreadSpec> &specs) + ScriptedFrameProviderDescriptor(lldb::ScriptedMetadataSP metadata_sp, + const std::vector<ThreadSpec> &specs) : scripted_metadata_sp(metadata_sp), thread_specs(specs) {} /// Get the name of this descriptor (the scripted class name). @@ -47,6 +50,12 @@ return scripted_metadata_sp ? scripted_metadata_sp->GetClassName() : ""; } + /// Get the description of this frame provider. + /// + /// \return A string describing what this frame provider does, or an + /// empty string if no description is available. + std::string GetDescription() const; + /// Check if this descriptor applies to the given thread. bool AppliesToThread(Thread &thread) const { // If no thread specs specified, applies to all threads. @@ -64,6 +73,13 @@ /// Check if this descriptor has valid metadata for script-based providers.
bool IsValid() const { return scripted_metadata_sp != nullptr; } + /// Get a unique identifier for this descriptor based on its contents. + /// The ID is computed from the class name and arguments dictionary, + /// not from the pointer address, so two descriptors with the same + /// contents will have the same ID. + uint32_t GetID() const; + + /// Dump a description of this descriptor to the given stream. void Dump(Stream *s) const; }; @@ -95,7 +111,7 @@ /// otherwise an \a llvm::Error. static llvm::Expected<lldb::SyntheticFrameProviderSP> CreateInstance(lldb::StackFrameListSP input_frames, - const SyntheticFrameProviderDescriptor &descriptor); + const ScriptedFrameProviderDescriptor &descriptor); /// Try to create a SyntheticFrameProvider instance for the given input /// frames using a specific C++ plugin. @@ -125,6 +141,8 @@ ~SyntheticFrameProvider() override; + virtual std::string GetDescription() const = 0; + /// Get a single stack frame at the specified index. /// /// This method is called lazily - frames are only created when requested. diff --git a/lldb/include/lldb/Target/Target.h b/lldb/include/lldb/Target/Target.h index 40f9c9bea1c12..cbda1c835f6bd 100644 --- a/lldb/include/lldb/Target/Target.h +++ b/lldb/include/lldb/Target/Target.h @@ -32,6 +32,7 @@ #include "lldb/Target/PathMappingList.h" #include "lldb/Target/SectionLoadHistory.h" #include "lldb/Target/Statistics.h" +#include "lldb/Target/SyntheticFrameProvider.h" #include "lldb/Target/ThreadSpec.h" #include "lldb/Utility/ArchSpec.h" #include "lldb/Utility/Broadcaster.h" @@ -697,6 +698,36 @@ class Target : public std::enable_shared_from_this<Target>, Status Attach(ProcessAttachInfo &attach_info, Stream *stream); // Optional stream to receive first stop info + /// Add or update a scripted frame provider descriptor for this target. + /// All new threads in this target will check if they match any descriptors + /// to create their frame providers. + /// + /// \param[in] descriptor + /// The descriptor to add or update. + /// + /// \return + /// The descriptor identifier if the registration succeeded, otherwise an + /// llvm::Error. + llvm::Expected<uint32_t> AddScriptedFrameProviderDescriptor( + const ScriptedFrameProviderDescriptor &descriptor); + + /// Remove a scripted frame provider descriptor by id. + /// + /// \param[in] id + /// The id of the descriptor to remove. + /// + /// \return + /// True if a descriptor was removed, false if no descriptor with that + /// id existed. + bool RemoveScriptedFrameProviderDescriptor(uint32_t id); + + /// Clear all scripted frame provider descriptors for this target. + void ClearScriptedFrameProviderDescriptors(); + + /// Get all scripted frame provider descriptors for this target. + const llvm::DenseMap<uint32_t, ScriptedFrameProviderDescriptor> & + GetScriptedFrameProviderDescriptors() const; + // This part handles the breakpoints. BreakpointList &GetBreakpointList(bool internal = false); @@ -1689,6 +1720,13 @@ class Target : public std::enable_shared_from_this<Target>, PathMappingList m_image_search_paths; TypeSystemMap m_scratch_type_system_map; + /// Map of scripted frame provider descriptors for this target. + /// Keys are the provider descriptor ids, values are the descriptors. + /// Used to initialize frame providers for new threads.
+ llvm::DenseMap<uint32_t, ScriptedFrameProviderDescriptor> + m_frame_provider_descriptors; + mutable std::recursive_mutex m_frame_provider_descriptors_mutex; + typedef std::map<lldb::LanguageType, lldb::REPLSP> REPLMap; REPLMap m_repl_map; diff --git a/lldb/include/lldb/Target/Thread.h b/lldb/include/lldb/Target/Thread.h index 841f80cd1b1eb..46ce192556756 100644 --- a/lldb/include/lldb/Target/Thread.h +++ b/lldb/include/lldb/Target/Thread.h @@ -1297,6 +1297,15 @@ class Thread : public std::enable_shared_from_this<Thread>, lldb::StackFrameListSP GetStackFrameList(); + llvm::Error + LoadScriptedFrameProvider(const ScriptedFrameProviderDescriptor &descriptor); + + void ClearScriptedFrameProvider(); + + lldb::SyntheticFrameProviderSP GetFrameProvider() const { + return m_frame_provider_sp; + } + protected: friend class ThreadPlan; friend class ThreadList; @@ -1400,6 +1409,9 @@ /// The Thread backed by this thread, if any. lldb::ThreadWP m_backed_thread; + /// The Scripted Frame Provider, if any. + lldb::SyntheticFrameProviderSP m_frame_provider_sp; + private: bool m_extended_info_fetched; // Have we tried to retrieve the m_extended_info // for this thread? diff --git a/lldb/include/lldb/Target/ThreadSpec.h b/lldb/include/lldb/Target/ThreadSpec.h index 7c7c832741196..63f8f8b5ec181 100644 --- a/lldb/include/lldb/Target/ThreadSpec.h +++ b/lldb/include/lldb/Target/ThreadSpec.h @@ -34,6 +34,8 @@ class ThreadSpec { public: ThreadSpec(); + ThreadSpec(Thread &thread); + static std::unique_ptr<ThreadSpec> CreateFromStructuredData(const StructuredData::Dictionary &data_dict, Status &error); diff --git a/lldb/include/lldb/Utility/ScriptedMetadata.h b/lldb/include/lldb/Utility/ScriptedMetadata.h index 69c83edce909a..8523c95429718 100644 --- a/lldb/include/lldb/Utility/ScriptedMetadata.h +++ b/lldb/include/lldb/Utility/ScriptedMetadata.h @@ -10,7 +10,9 @@ #define LLDB_INTERPRETER_SCRIPTEDMETADATA_H #include "lldb/Utility/ProcessInfo.h" +#include "lldb/Utility/StreamString.h" #include "lldb/Utility/StructuredData.h" +#include "llvm/ADT/Hashing.h" namespace lldb_private { class ScriptedMetadata { @@ -27,11 +29,36 @@ class ScriptedMetadata { } } + ScriptedMetadata(const ScriptedMetadata &other) + : m_class_name(other.m_class_name), m_args_sp(other.m_args_sp) {} + explicit operator bool() const { return !m_class_name.empty(); } llvm::StringRef GetClassName() const { return m_class_name; } StructuredData::DictionarySP GetArgsSP() const { return m_args_sp; } + /// Get a unique identifier for this metadata based on its contents. + /// The ID is computed from the class name and arguments dictionary, + /// not from the pointer address, so two metadata objects with the same + /// contents will have the same ID. + uint32_t GetID() const { + if (m_class_name.empty()) + return 0; + + // Hash the class name. + llvm::hash_code hash = llvm::hash_value(m_class_name); + + // Hash the arguments dictionary if present. + if (m_args_sp) { + StreamString ss; + m_args_sp->GetDescription(ss); + hash = llvm::hash_combine(hash, llvm::hash_value(ss.GetData())); + } + + // Return the lower 32 bits of the hash.
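+    // (Truncating the 64-bit hash_code is deliberate: the SB API and the +    // target's descriptor map both use 32-bit provider ids.)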
+ return static_cast<uint32_t>(hash); + } + private: std::string m_class_name; StructuredData::DictionarySP m_args_sp; diff --git a/lldb/include/lldb/lldb-private-interfaces.h b/lldb/include/lldb/lldb-private-interfaces.h index 5fc5c14c52f9e..52806eea190a7 100644 --- a/lldb/include/lldb/lldb-private-interfaces.h +++ b/lldb/include/lldb/lldb-private-interfaces.h @@ -26,7 +26,7 @@ class Value; namespace lldb_private { class ScriptedInterfaceUsages; -struct SyntheticFrameProviderDescriptor; +struct ScriptedFrameProviderDescriptor; typedef lldb::ABISP (*ABICreateInstance)(lldb::ProcessSP process_sp, const ArchSpec &arch); typedef std::unique_ptr<Architecture> (*ArchitectureCreateInstance)( @@ -91,7 +91,7 @@ typedef lldb::ScriptInterpreterSP (*ScriptInterpreterCreateInstance)( typedef llvm::Expected<lldb::SyntheticFrameProviderSP> ( *ScriptedFrameProviderCreateInstance)( lldb::StackFrameListSP input_frames, - const lldb_private::SyntheticFrameProviderDescriptor &descriptor); + const lldb_private::ScriptedFrameProviderDescriptor &descriptor); typedef llvm::Expected<lldb::SyntheticFrameProviderSP> ( *SyntheticFrameProviderCreateInstance)( lldb::StackFrameListSP input_frames, diff --git a/lldb/source/API/SBTarget.cpp b/lldb/source/API/SBTarget.cpp index 98d10aa07c53f..bb1d98b6e15c1 100644 --- a/lldb/source/API/SBTarget.cpp +++ b/lldb/source/API/SBTarget.cpp @@ -23,6 +23,7 @@ #include "lldb/API/SBStringList.h" #include "lldb/API/SBStructuredData.h" #include "lldb/API/SBSymbolContextList.h" +#include "lldb/API/SBThreadCollection.h" #include "lldb/API/SBTrace.h" #include "lldb/Breakpoint/BreakpointID.h" #include "lldb/Breakpoint/BreakpointIDList.h" @@ -39,6 +40,7 @@ #include "lldb/Core/Section.h" #include "lldb/Core/StructuredDataImpl.h" #include "lldb/Host/Host.h" +#include "lldb/Interpreter/Interfaces/ScriptedFrameProviderInterface.h" #include "lldb/Symbol/DeclVendor.h" #include "lldb/Symbol/ObjectFile.h" #include "lldb/Symbol/SymbolFile.h" @@ -50,6 +52,7 @@ #include "lldb/Target/LanguageRuntime.h" #include "lldb/Target/Process.h" #include "lldb/Target/StackFrame.h" +#include "lldb/Target/SyntheticFrameProvider.h" #include "lldb/Target/Target.h" #include "lldb/Target/TargetList.h" #include "lldb/Utility/ArchSpec.h" @@ -59,6 +62,7 @@ #include "lldb/Utility/LLDBLog.h" #include "lldb/Utility/ProcessInfo.h" #include "lldb/Utility/RegularExpression.h" +#include "lldb/Utility/ScriptedMetadata.h" #include "lldb/ValueObject/ValueObjectConstResult.h" #include "lldb/ValueObject/ValueObjectList.h" #include "lldb/ValueObject/ValueObjectVariable.h" @@ -2408,3 +2412,81 @@ lldb::SBMutex SBTarget::GetAPIMutex() const { return lldb::SBMutex(target_sp); return lldb::SBMutex(); } + +uint32_t +SBTarget::RegisterScriptedFrameProvider(const char *class_name, + lldb::SBStructuredData args_dict, + lldb::SBError &error) { + LLDB_INSTRUMENT_VA(this, class_name, args_dict, error); + + TargetSP target_sp = GetSP(); + if (!target_sp) { + error.SetErrorString("invalid target"); + return 0; + } + + if (!class_name || !class_name[0]) { + error.SetErrorString("invalid class name"); + return 0; + } + + // Extract the dictionary from SBStructuredData. + StructuredData::DictionarySP dict_sp; + if (args_dict.IsValid() && args_dict.m_impl_up) { + StructuredData::ObjectSP obj_sp = args_dict.m_impl_up->GetObjectSP(); + if (obj_sp && obj_sp->GetType() != lldb::eStructuredDataTypeDictionary) { + error.SetErrorString("SBStructuredData argument isn't a dictionary"); + return 0; + } + dict_sp = std::make_shared<StructuredData::Dictionary>(obj_sp); + } + + // Create the ScriptedMetadata.
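+  // Identical (class name, args) pairs hash to the same id (see +  // ScriptedMetadata::GetID), so re-registering an identical provider simply +  // overwrites the existing descriptor.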
+ ScriptedMetadataSP metadata_sp = + std::make_shared<ScriptedMetadata>(class_name, dict_sp); + + // Create the interface for calling static methods. + ScriptedFrameProviderInterfaceSP interface_sp = + target_sp->GetDebugger() + .GetScriptInterpreter() + ->CreateScriptedFrameProviderInterface(); + + // Create a descriptor (applies to all threads by default). + ScriptedFrameProviderDescriptor descriptor(metadata_sp); + descriptor.interface_sp = interface_sp; + + llvm::Expected<uint32_t> descriptor_id_or_err = + target_sp->AddScriptedFrameProviderDescriptor(descriptor); + if (!descriptor_id_or_err) { + error.SetErrorString( + llvm::toString(descriptor_id_or_err.takeError()).c_str()); + return 0; + } + + // Registration succeeded; hand back the new descriptor id. + return *descriptor_id_or_err; +} + +lldb::SBError SBTarget::RemoveScriptedFrameProvider(uint32_t provider_id) { + LLDB_INSTRUMENT_VA(this, provider_id); + + SBError error; + TargetSP target_sp = GetSP(); + if (!target_sp) { + error.SetErrorString("invalid target"); + return error; + } + + if (!provider_id) { + error.SetErrorString("invalid provider id"); + return error; + } + + if (!target_sp->RemoveScriptedFrameProviderDescriptor(provider_id)) { + error.SetErrorStringWithFormat("no frame provider with id %u found", + provider_id); + return error; + } + + return {}; +} diff --git a/lldb/source/Commands/CommandObjectTarget.cpp b/lldb/source/Commands/CommandObjectTarget.cpp index 30bca639060e6..86373f5280271 100644 --- a/lldb/source/Commands/CommandObjectTarget.cpp +++ b/lldb/source/Commands/CommandObjectTarget.cpp @@ -51,6 +51,7 @@ #include "lldb/Utility/ConstString.h" #include "lldb/Utility/FileSpec.h" #include "lldb/Utility/LLDBLog.h" +#include "lldb/Utility/ScriptedMetadata.h" #include "lldb/Utility/State.h" #include "lldb/Utility/Stream.h" #include "lldb/Utility/StructuredData.h" @@ -5401,6 +5402,202 @@ class CommandObjectTargetDump : public CommandObjectMultiword { ~CommandObjectTargetDump() override = default; }; +#pragma mark CommandObjectTargetFrameProvider + +#define LLDB_OPTIONS_target_frame_provider_register +#include "CommandOptions.inc" + +class CommandObjectTargetFrameProviderRegister : public CommandObjectParsed { public: + CommandObjectTargetFrameProviderRegister(CommandInterpreter &interpreter) + : CommandObjectParsed( + interpreter, "target frame-provider register", + "Register a frame provider for all threads in this target.", nullptr, + eCommandRequiresTarget), + + m_class_options("target frame-provider", true, 'C', 'k', 'v', 0) { + m_all_options.Append(&m_class_options, LLDB_OPT_SET_1 | LLDB_OPT_SET_2, + LLDB_OPT_SET_ALL); + m_all_options.Finalize(); + + AddSimpleArgumentList(eArgTypeRunArgs, eArgRepeatOptional); + } + + ~CommandObjectTargetFrameProviderRegister() override = default; + + Options *GetOptions() override { return &m_all_options; } + + std::optional<std::string> GetRepeatCommand(Args &current_command_args, + uint32_t index) override { + return std::string(""); + } + +protected: + void DoExecute(Args &launch_args, CommandReturnObject &result) override { + ScriptedMetadataSP metadata_sp = std::make_shared<ScriptedMetadata>( + m_class_options.GetName(), m_class_options.GetStructuredData()); + + Target *target = m_exe_ctx.GetTargetPtr(); + if (!target) + target = &GetDebugger().GetDummyTarget(); + + // Create the interface for calling static methods. + ScriptedFrameProviderInterfaceSP interface_sp = + GetDebugger() + .GetScriptInterpreter() + ->CreateScriptedFrameProviderInterface(); + + // Create a descriptor from the metadata (applies to all threads by + // default).
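+    // An empty thread_specs list means the descriptor matches every thread; +    // callers can narrow it later by populating descriptor.thread_specs.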
+ ScriptedFrameProviderDescriptor descriptor(metadata_sp); + descriptor.interface_sp = interface_sp; + + auto id_or_err = target->AddScriptedFrameProviderDescriptor(descriptor); + if (!id_or_err) { + result.SetError(id_or_err.takeError()); + return; + } + + result.AppendMessageWithFormat( + "successfully registered scripted frame provider '%s' for target\n", + m_class_options.GetName().c_str()); + } + + OptionGroupPythonClassWithDict m_class_options; + OptionGroupOptions m_all_options; +}; + +class CommandObjectTargetFrameProviderClear : public CommandObjectParsed { +public: + CommandObjectTargetFrameProviderClear(CommandInterpreter &interpreter) + : CommandObjectParsed( + interpreter, "target frame-provider clear", + "Clear all registered frame providers from this target.", nullptr, + eCommandRequiresTarget) {} + + ~CommandObjectTargetFrameProviderClear() override = default; + +protected: + void DoExecute(Args &command, CommandReturnObject &result) override { + Target *target = m_exe_ctx.GetTargetPtr(); + if (!target) { + result.AppendError("invalid target"); + return; + } + + target->ClearScriptedFrameProviderDescriptors(); + + result.SetStatus(eReturnStatusSuccessFinishResult); + } +}; + +class CommandObjectTargetFrameProviderList : public CommandObjectParsed { +public: + CommandObjectTargetFrameProviderList(CommandInterpreter &interpreter) + : CommandObjectParsed( + interpreter, "target frame-provider list", + "List all registered frame providers for the target.", nullptr, + eCommandRequiresTarget) {} + + ~CommandObjectTargetFrameProviderList() override = default; + +protected: + void DoExecute(Args &command, CommandReturnObject &result) override { + Target *target = m_exe_ctx.GetTargetPtr(); + if (!target) + target = &GetDebugger().GetDummyTarget(); + + const auto &descriptors = target->GetScriptedFrameProviderDescriptors(); + if (descriptors.empty()) { + result.AppendMessage("no frame providers registered for this target."); + result.SetStatus(eReturnStatusSuccessFinishResult); + return; + } + + result.AppendMessageWithFormat("%u frame provider(s) registered:\n\n", + descriptors.size()); + + for (const auto &entry : descriptors) { + const ScriptedFrameProviderDescriptor &descriptor = entry.second; + descriptor.Dump(&result.GetOutputStream()); + result.GetOutputStream().PutChar('\n'); + } + + result.SetStatus(eReturnStatusSuccessFinishResult); + } +}; + +class CommandObjectTargetFrameProviderRemove : public CommandObjectParsed { +public: + CommandObjectTargetFrameProviderRemove(CommandInterpreter &interpreter) + : CommandObjectParsed( + interpreter, "target frame-provider remove", + "Remove a registered frame provider from the target by id.", + "target frame-provider remove <provider-id>", + eCommandRequiresTarget) { + AddSimpleArgumentList(eArgTypeUnsignedInteger, eArgRepeatPlus); + } + + ~CommandObjectTargetFrameProviderRemove() override = default; + +protected: + void DoExecute(Args &command, CommandReturnObject &result) override { + Target *target = m_exe_ctx.GetTargetPtr(); + if (!target) + target = &GetDebugger().GetDummyTarget(); + + std::vector<uint32_t> removed_provider_ids; + for (size_t i = 0; i < command.GetArgumentCount(); i++) { + uint32_t provider_id = 0; + if (!llvm::to_integer(command[i].ref(), provider_id)) { + result.AppendError("target frame-provider remove requires integer " + "provider id argument"); + return; + } + + if (!target->RemoveScriptedFrameProviderDescriptor(provider_id)) { + result.AppendErrorWithFormat( + "no frame provider with id %u found in target\n", provider_id);
return; + } + removed_provider_ids.push_back(provider_id); + } + + if (size_t num_removed_providers = removed_provider_ids.size()) { + result.AppendMessageWithFormat( + "Successfully removed %zu frame providers.\n", num_removed_providers); + result.SetStatus(eReturnStatusSuccessFinishNoResult); + } else { + result.AppendError("0 frame providers removed.\n"); + } + } +}; + +class CommandObjectTargetFrameProvider : public CommandObjectMultiword { +public: + CommandObjectTargetFrameProvider(CommandInterpreter &interpreter) + : CommandObjectMultiword( + interpreter, "target frame-provider", + "Commands for registering and viewing frame providers for the " + "target.", + "target frame-provider <subcommand> [<subcommand-options>]") { + LoadSubCommand("register", + CommandObjectSP(new CommandObjectTargetFrameProviderRegister( + interpreter))); + LoadSubCommand("clear", + CommandObjectSP( + new CommandObjectTargetFrameProviderClear(interpreter))); + LoadSubCommand( + "list", + CommandObjectSP(new CommandObjectTargetFrameProviderList(interpreter))); + LoadSubCommand( + "remove", CommandObjectSP( + new CommandObjectTargetFrameProviderRemove(interpreter))); + } + + ~CommandObjectTargetFrameProvider() override = default; +}; + #pragma mark CommandObjectMultiwordTarget // CommandObjectMultiwordTarget @@ -5416,6 +5613,9 @@ CommandObjectMultiwordTarget::CommandObjectMultiwordTarget( CommandObjectSP(new CommandObjectTargetDelete(interpreter))); LoadSubCommand("dump", CommandObjectSP(new CommandObjectTargetDump(interpreter))); + LoadSubCommand( + "frame-provider", + CommandObjectSP(new CommandObjectTargetFrameProvider(interpreter))); LoadSubCommand("list", CommandObjectSP(new CommandObjectTargetList(interpreter))); LoadSubCommand("select", diff --git a/lldb/source/Interpreter/ScriptInterpreter.cpp b/lldb/source/Interpreter/ScriptInterpreter.cpp index 211868b51facb..69d8607a873f3 100644 --- a/lldb/source/Interpreter/ScriptInterpreter.cpp +++ b/lldb/source/Interpreter/ScriptInterpreter.cpp @@ -106,6 +106,13 @@ ScriptInterpreter::GetStatusFromSBError(const lldb::SBError &error) const { return Status(); } +lldb::ThreadSP ScriptInterpreter::GetOpaqueTypeFromSBThread( + const lldb::SBThread &thread) const { + if (thread.m_opaque_sp) + return thread.m_opaque_sp->GetThreadSP(); + return nullptr; +} + lldb::StackFrameSP ScriptInterpreter::GetOpaqueTypeFromSBFrame(const lldb::SBFrame &frame) const { if (frame.m_opaque_sp) diff --git a/lldb/source/Plugins/CMakeLists.txt b/lldb/source/Plugins/CMakeLists.txt index 08f444e7b15e8..b6878b21ff71a 100644 --- a/lldb/source/Plugins/CMakeLists.txt +++ b/lldb/source/Plugins/CMakeLists.txt @@ -22,6 +22,7 @@ add_subdirectory(SymbolFile) add_subdirectory(SystemRuntime) add_subdirectory(SymbolLocator) add_subdirectory(SymbolVendor) +add_subdirectory(SyntheticFrameProvider) add_subdirectory(Trace) add_subdirectory(TraceExporter) add_subdirectory(TypeSystem) diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp index 9beb133f5595f..9fff4adbff79d 100644 --- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp +++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp @@ -537,7 +537,7 @@ __lldb_apple_objc_v2_get_shared_cache_class_info (void *objc_opt_ro_ptr, for (uint32_t i=0; iversion >= 12 && objc_opt->version <= 15) diff --git a/lldb/source/Plugins/Process/scripted/ScriptedFrame.cpp 
index 6519df9185df0..53d0c22e62ad7 100644
--- a/lldb/source/Plugins/Process/scripted/ScriptedFrame.cpp
+++ b/lldb/source/Plugins/Process/scripted/ScriptedFrame.cpp
@@ -7,8 +7,22 @@
 //===----------------------------------------------------------------------===//
 
 #include "ScriptedFrame.h"
-
+#include "Plugins/Process/Utility/RegisterContextMemory.h"
+
+#include "lldb/Core/Address.h"
+#include "lldb/Core/Debugger.h"
+#include "lldb/Interpreter/Interfaces/ScriptedFrameInterface.h"
+#include "lldb/Interpreter/Interfaces/ScriptedThreadInterface.h"
+#include "lldb/Interpreter/ScriptInterpreter.h"
+#include "lldb/Symbol/SymbolContext.h"
+#include "lldb/Target/ExecutionContext.h"
+#include "lldb/Target/Process.h"
+#include "lldb/Target/RegisterContext.h"
+#include "lldb/Target/Thread.h"
 #include "lldb/Utility/DataBufferHeap.h"
+#include "lldb/Utility/LLDBLog.h"
+#include "lldb/Utility/Log.h"
+#include "lldb/Utility/StructuredData.h"
 
 using namespace lldb;
 using namespace lldb_private;
@@ -19,30 +33,44 @@ void ScriptedFrame::CheckInterpreterAndScriptObject() const {
 }
 
 llvm::Expected<std::shared_ptr<ScriptedFrame>>
-ScriptedFrame::Create(ScriptedThread &thread,
+ScriptedFrame::Create(ThreadSP thread_sp,
+                      ScriptedThreadInterfaceSP scripted_thread_interface_sp,
                       StructuredData::DictionarySP args_sp,
                       StructuredData::Generic *script_object) {
-  if (!thread.IsValid())
-    return llvm::createStringError("Invalid scripted thread.");
+  if (!thread_sp || !thread_sp->IsValid())
+    return llvm::createStringError("invalid thread");
+
+  ProcessSP process_sp = thread_sp->GetProcess();
+  if (!process_sp || !process_sp->IsValid())
+    return llvm::createStringError("invalid process");
 
-  thread.CheckInterpreterAndScriptObject();
+  ScriptInterpreter *script_interp =
+      process_sp->GetTarget().GetDebugger().GetScriptInterpreter();
+  if (!script_interp)
+    return llvm::createStringError("no script interpreter");
 
-  auto scripted_frame_interface =
-      thread.GetInterface()->CreateScriptedFrameInterface();
+  auto scripted_frame_interface = script_interp->CreateScriptedFrameInterface();
   if (!scripted_frame_interface)
     return llvm::createStringError("failed to create scripted frame interface");
 
   llvm::StringRef frame_class_name;
   if (!script_object) {
-    std::optional<llvm::StringRef> class_name =
-        thread.GetInterface()->GetScriptedFramePluginName();
-    if (!class_name || class_name->empty())
+    // If no script object is provided and we have a scripted thread interface,
+    // try to get the frame class name from it.
+    if (scripted_thread_interface_sp) {
+      std::optional<llvm::StringRef> class_name =
+          scripted_thread_interface_sp->GetScriptedFramePluginName();
+      if (!class_name || class_name->empty())
+        return llvm::createStringError(
+            "failed to get scripted frame class name");
+      frame_class_name = *class_name;
+    } else {
       return llvm::createStringError(
-          "failed to get scripted thread class name");
-    frame_class_name = *class_name;
+          "no script object provided and no scripted thread interface");
+    }
   }
 
-  ExecutionContext exe_ctx(thread);
+  ExecutionContext exe_ctx(thread_sp);
 
   auto obj_or_err = scripted_frame_interface->CreatePluginObject(
       frame_class_name, exe_ctx, args_sp, script_object);
@@ -62,7 +90,7 @@ ScriptedFrame::Create(ScriptedThread &thread,
   SymbolContext sc;
   Address symbol_addr;
   if (pc != LLDB_INVALID_ADDRESS) {
-    symbol_addr.SetLoadAddress(pc, &thread.GetProcess()->GetTarget());
+    symbol_addr.SetLoadAddress(pc, &process_sp->GetTarget());
     symbol_addr.CalculateSymbolContext(&sc);
   }
 
@@ -77,11 +105,11 @@ ScriptedFrame::Create(ScriptedThread &thread,
 
   if (!reg_info)
     return llvm::createStringError(
-        "failed to get scripted thread registers info");
+        "failed to get scripted frame registers info");
 
   std::shared_ptr<DynamicRegisterInfo> register_info_sp =
-      DynamicRegisterInfo::Create(
-          *reg_info, thread.GetProcess()->GetTarget().GetArchitecture());
+      DynamicRegisterInfo::Create(*reg_info,
+                                  process_sp->GetTarget().GetArchitecture());
 
   lldb::RegisterContextSP reg_ctx_sp;
 
@@ -96,32 +124,35 @@ ScriptedFrame::Create(ScriptedThread &thread,
 
     std::shared_ptr<RegisterContextMemory> reg_ctx_memory =
         std::make_shared<RegisterContextMemory>(
-            thread, frame_id, *register_info_sp, LLDB_INVALID_ADDRESS);
+            *thread_sp, frame_id, *register_info_sp, LLDB_INVALID_ADDRESS);
     if (!reg_ctx_memory)
-      return llvm::createStringError("failed to create a register context.");
+      return llvm::createStringError("failed to create a register context");
 
     reg_ctx_memory->SetAllRegisterData(data_sp);
     reg_ctx_sp = reg_ctx_memory;
   }
 
   return std::make_shared<ScriptedFrame>(
-      thread, scripted_frame_interface, frame_id, pc, sc, reg_ctx_sp,
+      thread_sp, scripted_frame_interface, frame_id, pc, sc, reg_ctx_sp,
       register_info_sp, owned_script_object_sp);
 }
 
-ScriptedFrame::ScriptedFrame(ScriptedThread &thread,
+ScriptedFrame::ScriptedFrame(ThreadSP thread_sp,
                              ScriptedFrameInterfaceSP interface_sp,
                              lldb::user_id_t id, lldb::addr_t pc,
                              SymbolContext &sym_ctx,
                              lldb::RegisterContextSP reg_ctx_sp,
                              std::shared_ptr<DynamicRegisterInfo> reg_info_sp,
                              StructuredData::GenericSP script_object_sp)
-    : StackFrame(thread.shared_from_this(), /*frame_idx=*/id,
+    : StackFrame(thread_sp, /*frame_idx=*/id,
                  /*concrete_frame_idx=*/id, /*reg_context_sp=*/reg_ctx_sp,
                  /*cfa=*/0, /*pc=*/pc, /*behaves_like_zeroth_frame=*/!id,
                  /*symbol_ctx=*/&sym_ctx),
       m_scripted_frame_interface_sp(interface_sp),
-      m_script_object_sp(script_object_sp), m_register_info_sp(reg_info_sp) {}
+      m_script_object_sp(script_object_sp), m_register_info_sp(reg_info_sp) {
+  // FIXME: This should be part of the base class constructor.
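+  // Marking the frame as synthetic here lets clients such as
+  // SBFrame::IsSynthetic() distinguish scripted frames from frames produced
+  // by the unwinder.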
+  m_stack_frame_kind = StackFrame::Kind::Synthetic;
+}
 
 ScriptedFrame::~ScriptedFrame() {}
 
@@ -164,7 +195,7 @@ std::shared_ptr<DynamicRegisterInfo> ScriptedFrame::GetDynamicRegisterInfo() {
     if (!reg_info)
       return ScriptedInterface::ErrorWithMessage<
           std::shared_ptr<DynamicRegisterInfo>>(
-          LLVM_PRETTY_FUNCTION, "Failed to get scripted frame registers info.",
+          LLVM_PRETTY_FUNCTION, "failed to get scripted frame registers info",
           error, LLDBLog::Thread);
 
     ThreadSP thread_sp = m_thread_wp.lock();
@@ -172,7 +203,7 @@ std::shared_ptr<DynamicRegisterInfo> ScriptedFrame::GetDynamicRegisterInfo() {
       return ScriptedInterface::ErrorWithMessage<
           std::shared_ptr<DynamicRegisterInfo>>(
           LLVM_PRETTY_FUNCTION,
-          "Failed to get scripted frame registers info: invalid thread.", error,
+          "failed to get scripted frame registers info: invalid thread", error,
           LLDBLog::Thread);
 
     ProcessSP process_sp = thread_sp->GetProcess();
@@ -180,8 +211,8 @@ std::shared_ptr<DynamicRegisterInfo> ScriptedFrame::GetDynamicRegisterInfo() {
       return ScriptedInterface::ErrorWithMessage<
           std::shared_ptr<DynamicRegisterInfo>>(
           LLVM_PRETTY_FUNCTION,
-          "Failed to get scripted frame registers info: invalid process.",
-          error, LLDBLog::Thread);
+          "failed to get scripted frame registers info: invalid process", error,
+          LLDBLog::Thread);
 
     m_register_info_sp = DynamicRegisterInfo::Create(
         *reg_info, process_sp->GetTarget().GetArchitecture());
diff --git a/lldb/source/Plugins/Process/scripted/ScriptedFrame.h b/lldb/source/Plugins/Process/scripted/ScriptedFrame.h
index b6b77c4a7d160..e91e6160bac2f 100644
--- a/lldb/source/Plugins/Process/scripted/ScriptedFrame.h
+++ b/lldb/source/Plugins/Process/scripted/ScriptedFrame.h
@@ -10,21 +10,19 @@
 #define LLDB_SOURCE_PLUGINS_SCRIPTED_FRAME_H
 
 #include "ScriptedThread.h"
-#include "lldb/Interpreter/ScriptInterpreter.h"
 #include "lldb/Target/DynamicRegisterInfo.h"
 #include "lldb/Target/StackFrame.h"
+#include "lldb/lldb-forward.h"
+#include "llvm/Support/Error.h"
+#include
 #include
 
-namespace lldb_private {
-class ScriptedThread;
-}
-
 namespace lldb_private {
 
 class ScriptedFrame : public lldb_private::StackFrame {
 public:
-  ScriptedFrame(ScriptedThread &thread,
+  ScriptedFrame(lldb::ThreadSP thread_sp,
                 lldb::ScriptedFrameInterfaceSP interface_sp,
                 lldb::user_id_t frame_idx, lldb::addr_t pc,
                 SymbolContext &sym_ctx, lldb::RegisterContextSP reg_ctx_sp,
@@ -33,8 +31,29 @@ class ScriptedFrame : public lldb_private::StackFrame {
 
   ~ScriptedFrame() override;
 
+  /// Create a ScriptedFrame from an object instantiated in the script
+  /// interpreter.
+  ///
+  /// \param[in] thread_sp
+  ///   The thread this frame belongs to.
+  ///
+  /// \param[in] scripted_thread_interface_sp
+  ///   The scripted thread interface (needed for ScriptedThread
+  ///   compatibility). Can be nullptr for frames on real threads.
+  ///
+  /// \param[in] args_sp
+  ///   Arguments to pass to the frame creation.
+  ///
+  /// \param[in] script_object
+  ///   The optional script object representing this frame.
+  ///
+  /// \return
+  ///   An Expected containing the ScriptedFrame shared pointer if successful,
+  ///   otherwise an error.
   static llvm::Expected<std::shared_ptr<ScriptedFrame>>
-  Create(ScriptedThread &thread, StructuredData::DictionarySP args_sp,
+  Create(lldb::ThreadSP thread_sp,
+         lldb::ScriptedThreadInterfaceSP scripted_thread_interface_sp,
+         StructuredData::DictionarySP args_sp,
          StructuredData::Generic *script_object = nullptr);
 
   bool IsInlined() override;
diff --git a/lldb/source/Plugins/Process/scripted/ScriptedThread.cpp b/lldb/source/Plugins/Process/scripted/ScriptedThread.cpp
index 491efac5aadef..1dd9c48f56a59 100644
--- a/lldb/source/Plugins/Process/scripted/ScriptedThread.cpp
+++ b/lldb/source/Plugins/Process/scripted/ScriptedThread.cpp
@@ -210,7 +210,7 @@ bool ScriptedThread::LoadArtificialStackFrames() {
         SymbolContext sc;
         symbol_addr.CalculateSymbolContext(&sc);
 
-        return std::make_shared<StackFrame>(this->shared_from_this(), idx, idx, cfa,
+        return std::make_shared<StackFrame>(shared_from_this(), idx, idx, cfa,
                                             cfa_is_valid, pc,
                                             StackFrame::Kind::Synthetic,
                                             artificial,
                                             behaves_like_zeroth_frame, &sc);
@@ -231,8 +231,8 @@ bool ScriptedThread::LoadArtificialStackFrames() {
       return error.ToError();
     }
 
-    auto frame_or_error =
-        ScriptedFrame::Create(*this, nullptr, object_sp->GetAsGeneric());
+    auto frame_or_error = ScriptedFrame::Create(
+        shared_from_this(), GetInterface(), nullptr, object_sp->GetAsGeneric());
 
     if (!frame_or_error) {
       ScriptedInterface::ErrorWithMessage<bool>(
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptInterpreterPythonInterfaces.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptInterpreterPythonInterfaces.cpp
index d43036d6fe544..f6c707b2bd168 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptInterpreterPythonInterfaces.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptInterpreterPythonInterfaces.cpp
@@ -31,6 +31,7 @@ void ScriptInterpreterPythonInterfaces::Initialize() {
   ScriptedStopHookPythonInterface::Initialize();
   ScriptedBreakpointPythonInterface::Initialize();
   ScriptedThreadPlanPythonInterface::Initialize();
+  ScriptedFrameProviderPythonInterface::Initialize();
 }
 
 void ScriptInterpreterPythonInterfaces::Terminate() {
@@ -40,6 +41,7 @@ void ScriptInterpreterPythonInterfaces::Terminate() {
   ScriptedStopHookPythonInterface::Terminate();
   ScriptedBreakpointPythonInterface::Terminate();
   ScriptedThreadPlanPythonInterface::Terminate();
+  ScriptedFrameProviderPythonInterface::Terminate();
 }
 
 #endif
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.cpp
index b866bf332b7b6..3dde5036453f4 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "lldb/Core/PluginManager.h"
 #include "lldb/Host/Config.h"
 #include "lldb/Target/Thread.h"
 #include "lldb/Utility/Log.h"
@@ -30,18 +31,45 @@ ScriptedFrameProviderPythonInterface::ScriptedFrameProviderPythonInterface(
     ScriptInterpreterPythonImpl &interpreter)
     : ScriptedFrameProviderInterface(), ScriptedPythonInterface(interpreter) {}
 
+bool ScriptedFrameProviderPythonInterface::AppliesToThread(
+    llvm::StringRef class_name, lldb::ThreadSP thread_sp) {
+  // If there is any issue with this method, we will just assume it also
+  // applies to this thread, which is the default behavior.
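+  // For reference, the provider class opts in or out per thread with a
+  // static method; a minimal Python sketch (hypothetical provider class),
+  // matching the "applies_to_thread" name used below:
+  //
+  //   @staticmethod
+  //   def applies_to_thread(thread):
+  //       # Only attach to the thread with index id 1.
+  //       return thread.GetIndexID() == 1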
+  constexpr bool fail_value = true;
+  Status error;
+  StructuredData::ObjectSP obj =
+      CallStaticMethod(class_name, "applies_to_thread", error, thread_sp);
+  if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj,
+                                                    error))
+    return fail_value;
+
+  return obj->GetBooleanValue(fail_value);
+}
+
 llvm::Expected<StructuredData::GenericSP>
 ScriptedFrameProviderPythonInterface::CreatePluginObject(
     const llvm::StringRef class_name, lldb::StackFrameListSP input_frames,
     StructuredData::DictionarySP args_sp) {
   if (!input_frames)
-    return llvm::createStringError("Invalid frame list");
+    return llvm::createStringError("invalid frame list");
 
   StructuredDataImpl sd_impl(args_sp);
   return ScriptedPythonInterface::CreatePluginObject(class_name, nullptr,
                                                      input_frames, sd_impl);
 }
 
+std::string ScriptedFrameProviderPythonInterface::GetDescription(
+    llvm::StringRef class_name) {
+  Status error;
+  StructuredData::ObjectSP obj =
+      CallStaticMethod(class_name, "get_description", error);
+  if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj,
+                                                    error))
+    return {};
+
+  return obj->GetStringValue().str();
+}
+
 StructuredData::ObjectSP
 ScriptedFrameProviderPythonInterface::GetFrameAtIndex(uint32_t index) {
   Status error;
@@ -54,4 +82,32 @@ ScriptedFrameProviderPythonInterface::GetFrameAtIndex(uint32_t index) {
   return obj;
 }
 
+bool ScriptedFrameProviderPythonInterface::CreateInstance(
+    lldb::ScriptLanguage language, ScriptedInterfaceUsages usages) {
+  if (language != eScriptLanguagePython)
+    return false;
+
+  return true;
+}
+
+void ScriptedFrameProviderPythonInterface::Initialize() {
+  const std::vector<llvm::StringRef> ci_usages = {
+      "target frame-provider register -C <class-name> [-k key -v value ...]",
+      "target frame-provider list",
+      "target frame-provider remove <provider-id>",
+      "target frame-provider clear"};
+  const std::vector<llvm::StringRef> api_usages = {
+      "SBTarget.RegisterScriptedFrameProvider",
+      "SBTarget.RemoveScriptedFrameProvider",
+      "SBTarget.ClearScriptedFrameProvider"};
+  PluginManager::RegisterPlugin(
+      GetPluginNameStatic(),
+      llvm::StringRef("Provide scripted stack frames for threads"),
+      CreateInstance, eScriptLanguagePython, {ci_usages, api_usages});
+}
+
+void ScriptedFrameProviderPythonInterface::Terminate() {
+  PluginManager::UnregisterPlugin(CreateInstance);
+}
+
 #endif
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.h
index fd163984028d3..97a5cc7c669ea 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedFrameProviderPythonInterface.h
@@ -14,17 +14,22 @@
 #if LLDB_ENABLE_PYTHON
 
 #include "ScriptedPythonInterface.h"
+#include "lldb/Core/PluginInterface.h"
 #include "lldb/Interpreter/Interfaces/ScriptedFrameProviderInterface.h"
 #include
 
 namespace lldb_private {
 class ScriptedFrameProviderPythonInterface
     : public ScriptedFrameProviderInterface,
-      public ScriptedPythonInterface {
+      public ScriptedPythonInterface,
+      public PluginInterface {
 public:
   ScriptedFrameProviderPythonInterface(
       ScriptInterpreterPythonImpl &interpreter);
 
+  bool AppliesToThread(llvm::StringRef class_name,
+                       lldb::ThreadSP thread_sp) override;
+
   llvm::Expected<StructuredData::GenericSP>
   CreatePluginObject(llvm::StringRef class_name,
                      lldb::StackFrameListSP input_frames,
@@ -33,10 +38,24 @@ class ScriptedFrameProviderPythonInterface
   llvm::SmallVector<AbstractMethodRequirement>
   GetAbstractMethodRequirements() const override {
     return llvm::SmallVector<AbstractMethodRequirement>(
-        {{"get_frame_at_index"}});
+        {{"get_description"}, {"get_frame_at_index"}});
   }
 
+  std::string GetDescription(llvm::StringRef class_name) override;
+
   StructuredData::ObjectSP GetFrameAtIndex(uint32_t index) override;
+
+  static void Initialize();
+  static void Terminate();
+
+  static bool CreateInstance(lldb::ScriptLanguage language,
+                             ScriptedInterfaceUsages usages);
+
+  static llvm::StringRef GetPluginNameStatic() {
+    return "ScriptedFrameProviderPythonInterface";
+  }
+
+  llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); }
 };
 
 } // namespace lldb_private
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp
index af2e0b5df4d22..ba4473cf9ec4d 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp
@@ -93,6 +93,19 @@ ScriptedPythonInterface::ExtractValueFromPythonObject(
   return nullptr;
 }
 
+template <>
+lldb::ThreadSP
+ScriptedPythonInterface::ExtractValueFromPythonObject<lldb::ThreadSP>(
+    python::PythonObject &p, Status &error) {
+  if (lldb::SBThread *sb_thread = reinterpret_cast<lldb::SBThread *>(
+          python::LLDBSWIGPython_CastPyObjectToSBThread(p.get())))
+    return m_interpreter.GetOpaqueTypeFromSBThread(*sb_thread);
+  error = Status::FromErrorString(
+      "Couldn't cast lldb::SBThread to lldb_private::Thread.");
+
+  return nullptr;
+}
+
 template <>
 SymbolContext
 ScriptedPythonInterface::ExtractValueFromPythonObject<SymbolContext>(
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h
index ec1dd9910d8a6..c460f58b4e721 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h
@@ -188,8 +188,13 @@ class ScriptedPythonInterface : virtual public ScriptedInterface {
       // This addresses the cases where the embedded interpreter session
      // dictionary is passed to the extension initializer which is not used
      // most of the time.
+      // Note, though none of our APIs suggest defining the interfaces with
+      // varargs, we have some extant clients that were doing that. To keep
+      // from breaking them, we just say that putting varargs in these
+      // signatures turns off argument checking.
       size_t num_args = sizeof...(Args);
-      if (num_args != arg_info->max_positional_args) {
+      if (arg_info->max_positional_args != PythonCallable::ArgInfo::UNBOUNDED &&
+          num_args != arg_info->max_positional_args) {
         if (num_args != arg_info->max_positional_args - 1)
           return create_error("Passed arguments ({0}) doesn't match the number "
                               "of expected arguments ({1}).",
@@ -325,6 +330,112 @@ class ScriptedPythonInterface : virtual public ScriptedInterface {
     return m_object_instance_sp;
   }
 
+  /// Call a static method on a Python class without creating an instance.
+  ///
+  /// This method resolves a Python class by name and calls a static method
+  /// on it, returning the result. This is useful for calling class-level
+  /// methods that don't require an instance.
+  ///
+  /// \param class_name The fully-qualified name of the Python class.
+  /// \param method_name The name of the static method to call.
+  /// \param error Output parameter to receive error information if the call
+  ///        fails.
+  /// \param args Arguments to pass to the static method.
+  ///
+  /// \return The return value of the static method call, or an error value.
+  template <typename T = StructuredData::ObjectSP, typename... Args>
+  T CallStaticMethod(llvm::StringRef class_name, llvm::StringRef method_name,
+                     Status &error, Args &&...args) {
+    using namespace python;
+    using Locker = ScriptInterpreterPythonImpl::Locker;
+
+    std::string caller_signature =
+        llvm::Twine(LLVM_PRETTY_FUNCTION + llvm::Twine(" (") +
+                    llvm::Twine(class_name) + llvm::Twine(".") +
+                    llvm::Twine(method_name) + llvm::Twine(")"))
+            .str();
+
+    if (class_name.empty())
+      return ErrorWithMessage<T>(caller_signature, "missing script class name",
+                                 error);
+
+    Locker py_lock(&m_interpreter, Locker::AcquireLock | Locker::NoSTDIN,
+                   Locker::FreeLock);
+
+    // Get the interpreter dictionary.
+    auto dict = PythonModule::MainModule().ResolveName<PythonDictionary>(
+        m_interpreter.GetDictionaryName());
+    if (!dict.IsAllocated())
+      return ErrorWithMessage<T>(
+          caller_signature,
+          llvm::formatv("could not find interpreter dictionary: {0}",
+                        m_interpreter.GetDictionaryName())
+              .str(),
+          error);
+
+    // Resolve the class.
+    auto class_obj = PythonObject::ResolveNameWithDictionary<PythonObject>(
+        class_name, dict);
+    if (!class_obj.IsAllocated())
+      return ErrorWithMessage<T>(
+          caller_signature,
+          llvm::formatv("could not find script class: {0}", class_name).str(),
+          error);
+
+    // Get the static method from the class.
+    if (!class_obj.HasAttribute(method_name))
+      return ErrorWithMessage<T>(
+          caller_signature,
+          llvm::formatv("class {0} does not have method {1}", class_name,
+                        method_name)
+              .str(),
+          error);
+
+    PythonCallable method =
+        class_obj.GetAttributeValue(method_name).AsType<PythonCallable>();
+    if (!method.IsAllocated())
+      return ErrorWithMessage<T>(caller_signature,
+                                 llvm::formatv("method {0}.{1} is not callable",
+                                               class_name, method_name)
+                                     .str(),
+                                 error);
+
+    // Transform the arguments.
+    std::tuple<Args...> original_args = std::forward_as_tuple(args...);
+    auto transformed_args = TransformArgs(original_args);
+
+    // Call the static method.
+    llvm::Expected<PythonObject> expected_return_object =
+        llvm::make_error<llvm::StringError>("Not initialized.",
+                                            llvm::inconvertibleErrorCode());
+    std::apply(
+        [&method, &expected_return_object](auto &&...args) {
+          llvm::consumeError(expected_return_object.takeError());
+          expected_return_object = method(args...);
+        },
+        transformed_args);
+
+    if (llvm::Error e = expected_return_object.takeError()) {
+      error = Status::FromError(std::move(e));
+      return ErrorWithMessage<T>(
+          caller_signature, "python static method could not be called", error);
+    }
+
+    PythonObject py_return = std::move(expected_return_object.get());
+
+    // Re-assign reference and pointer arguments if needed.
+    if (sizeof...(Args) > 0)
+      if (!ReassignPtrsOrRefsArgs(original_args, transformed_args))
+        return ErrorWithMessage<T>(
+            caller_signature,
+            "couldn't re-assign reference and pointer arguments", error);
+
+    // Extract value from Python object (handles unallocated case).
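+    // For reference, the Python counterpart of such a call is an ordinary
+    // @staticmethod on the resolved class; a minimal sketch (hypothetical
+    // provider class), matching the "get_description" usage elsewhere in
+    // this patch:
+    //
+    //   @staticmethod
+    //   def get_description():
+    //       return "Describe what this provider does"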
+    return ExtractValueFromPythonObject<T>(py_return, error);
+  }
+
 protected:
   template <typename T>
   T ExtractValueFromPythonObject(python::PythonObject &p, Status &error) {
@@ -341,7 +452,7 @@ class ScriptedPythonInterface : virtual public ScriptedInterface {
                     llvm::Twine(method_name) + llvm::Twine(")"))
             .str();
     if (!m_object_instance_sp)
-      return ErrorWithMessage<T>(caller_signature, "Python object ill-formed",
+      return ErrorWithMessage<T>(caller_signature, "python object ill-formed",
                                  error);
 
     Locker py_lock(&m_interpreter, Locker::AcquireLock | Locker::NoSTDIN,
@@ -353,7 +464,7 @@ class ScriptedPythonInterface : virtual public ScriptedInterface {
     if (!implementor.IsAllocated())
       return llvm::is_contained(GetAbstractMethods(), method_name)
                  ? ErrorWithMessage<T>(caller_signature,
-                                       "Python implementor not allocated.",
+                                       "python implementor not allocated",
                                        error)
                  : T{};
 
@@ -374,20 +485,20 @@ class ScriptedPythonInterface : virtual public ScriptedInterface {
     if (llvm::Error e = expected_return_object.takeError()) {
       error = Status::FromError(std::move(e));
       return ErrorWithMessage<T>(caller_signature,
-                                 "Python method could not be called.", error);
+                                 "python method could not be called", error);
     }
 
     PythonObject py_return = std::move(expected_return_object.get());
 
     // Now that we called the python method with the transformed arguments,
-    // we need to interate again over both the original and transformed
+    // we need to iterate again over both the original and transformed
     // parameter pack, and transform back the parameter that were passed in
     // the original parameter pack as references or pointers.
     if (sizeof...(Args) > 0)
       if (!ReassignPtrsOrRefsArgs(original_args, transformed_args))
         return ErrorWithMessage<T>(
             caller_signature,
-            "Couldn't re-assign reference and pointer arguments.", error);
+            "couldn't re-assign reference and pointer arguments", error);
 
     if (!py_return.IsAllocated())
       return {};
@@ -593,6 +704,11 @@ lldb::StreamSP
 ScriptedPythonInterface::ExtractValueFromPythonObject<lldb::StreamSP>(
     python::PythonObject &p, Status &error);
 
+template <>
+lldb::ThreadSP
+ScriptedPythonInterface::ExtractValueFromPythonObject<lldb::ThreadSP>(
+    python::PythonObject &p, Status &error);
+
 template <>
 lldb::StackFrameSP
 ScriptedPythonInterface::ExtractValueFromPythonObject<lldb::StackFrameSP>(
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h
index 2c971262fc34e..32948ffd30023 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h
@@ -265,6 +265,7 @@ void *LLDBSWIGPython_CastPyObjectToSBLaunchInfo(PyObject *data);
 void *LLDBSWIGPython_CastPyObjectToSBError(PyObject *data);
 void *LLDBSWIGPython_CastPyObjectToSBEvent(PyObject *data);
 void *LLDBSWIGPython_CastPyObjectToSBStream(PyObject *data);
+void *LLDBSWIGPython_CastPyObjectToSBThread(PyObject *data);
 void *LLDBSWIGPython_CastPyObjectToSBFrame(PyObject *data);
 void *LLDBSWIGPython_CastPyObjectToSBSymbolContext(PyObject *data);
 void *LLDBSWIGPython_CastPyObjectToSBValue(PyObject *data);
diff --git a/lldb/source/Plugins/SyntheticFrameProvider/CMakeLists.txt b/lldb/source/Plugins/SyntheticFrameProvider/CMakeLists.txt
new file mode 100644
index 0000000000000..85b405e648c1f
--- /dev/null
+++ b/lldb/source/Plugins/SyntheticFrameProvider/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(ScriptedFrameProvider)
diff --git a/lldb/source/Plugins/SyntheticFrameProvider/ScriptedFrameProvider/CMakeLists.txt b/lldb/source/Plugins/SyntheticFrameProvider/ScriptedFrameProvider/CMakeLists.txt
new file mode 100644
index 0000000000000..fe67d39efdf11
--- /dev/null
+++ b/lldb/source/Plugins/SyntheticFrameProvider/ScriptedFrameProvider/CMakeLists.txt
@@ -0,0 +1,12 @@
+add_lldb_library(lldbPluginScriptedFrameProvider PLUGIN
+  ScriptedFrameProvider.cpp
+
+  LINK_COMPONENTS
+    Support
+
+  LINK_LIBS
+    lldbCore
+    lldbInterpreter
+    lldbTarget
+    lldbUtility
+  )
diff --git a/lldb/source/Plugins/SyntheticFrameProvider/ScriptedFrameProvider/ScriptedFrameProvider.cpp b/lldb/source/Plugins/SyntheticFrameProvider/ScriptedFrameProvider/ScriptedFrameProvider.cpp
new file mode 100644
index 0000000000000..17d0e925fadc6
--- /dev/null
+++ b/lldb/source/Plugins/SyntheticFrameProvider/ScriptedFrameProvider/ScriptedFrameProvider.cpp
@@ -0,0 +1,215 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ScriptedFrameProvider.h"
+#include "Plugins/Process/scripted/ScriptedFrame.h"
+#include "lldb/Core/Debugger.h"
+#include "lldb/Core/PluginManager.h"
+#include "lldb/Interpreter/Interfaces/ScriptedFrameProviderInterface.h"
+#include "lldb/Interpreter/ScriptInterpreter.h"
+#include "lldb/Target/Process.h"
+#include "lldb/Target/StackFrame.h"
+#include "lldb/Target/Thread.h"
+#include "lldb/Utility/ScriptedMetadata.h"
+#include "lldb/Utility/Status.h"
+#include "llvm/Support/Error.h"
+#include <memory>
+
+using namespace lldb;
+using namespace lldb_private;
+
+void ScriptedFrameProvider::Initialize() {
+  PluginManager::RegisterPlugin(GetPluginNameStatic(),
+                                "Provides synthetic frames via scripting",
+                                nullptr, ScriptedFrameProvider::CreateInstance);
+}
+
+void ScriptedFrameProvider::Terminate() {
+  PluginManager::UnregisterPlugin(ScriptedFrameProvider::CreateInstance);
+}
+
+llvm::Expected<lldb::SyntheticFrameProviderSP>
+ScriptedFrameProvider::CreateInstance(
+    lldb::StackFrameListSP input_frames,
+    const ScriptedFrameProviderDescriptor &descriptor) {
+  if (!input_frames)
+    return llvm::createStringError(
+        "failed to create scripted frame provider: invalid input frames");
+
+  Thread &thread = input_frames->GetThread();
+  ProcessSP process_sp = thread.GetProcess();
+  if (!process_sp)
+    return nullptr;
+
+  if (!descriptor.IsValid())
+    return llvm::createStringError(
+        "failed to create scripted frame provider: invalid scripted metadata");
+
+  if (!descriptor.AppliesToThread(thread))
+    return nullptr;
+
+  ScriptInterpreter *script_interp =
+      process_sp->GetTarget().GetDebugger().GetScriptInterpreter();
+  if (!script_interp)
+    return llvm::createStringError("cannot create scripted frame provider: no "
+                                   "script interpreter installed");
+
+  ScriptedFrameProviderInterfaceSP interface_sp =
+      script_interp->CreateScriptedFrameProviderInterface();
+  if (!interface_sp)
+    return llvm::createStringError(
+        "cannot create scripted frame provider: script interpreter couldn't "
+        "create Scripted Frame Provider Interface");
+
+  const ScriptedMetadataSP scripted_metadata = descriptor.scripted_metadata_sp;
+
+  // If we shouldn't attach a frame provider to this thread, just exit early.
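+  // Note: the class-level AppliesToThread check defaults to true when the
+  // scripted method is missing or fails, so a provider that does not
+  // implement applies_to_thread attaches to every thread its descriptor
+  // matches.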
+  if (!interface_sp->AppliesToThread(scripted_metadata->GetClassName(),
+                                     thread.shared_from_this()))
+    return nullptr;
+
+  auto obj_or_err = interface_sp->CreatePluginObject(
+      scripted_metadata->GetClassName(), input_frames,
+      scripted_metadata->GetArgsSP());
+  if (!obj_or_err)
+    return obj_or_err.takeError();
+
+  StructuredData::ObjectSP object_sp = *obj_or_err;
+  if (!object_sp || !object_sp->IsValid())
+    return llvm::createStringError(
+        "cannot create scripted frame provider: failed to create valid "
+        "scripted frame provider object");
+
+  return std::make_shared<ScriptedFrameProvider>(input_frames, interface_sp,
+                                                 descriptor);
+}
+
+ScriptedFrameProvider::ScriptedFrameProvider(
+    StackFrameListSP input_frames,
+    lldb::ScriptedFrameProviderInterfaceSP interface_sp,
+    const ScriptedFrameProviderDescriptor &descriptor)
+    : SyntheticFrameProvider(input_frames), m_interface_sp(interface_sp),
+      m_descriptor(descriptor) {}
+
+ScriptedFrameProvider::~ScriptedFrameProvider() = default;
+
+std::string ScriptedFrameProvider::GetDescription() const {
+  if (!m_interface_sp)
+    return {};
+
+  return m_interface_sp->GetDescription(m_descriptor.GetName());
+}
+
+llvm::Expected<StackFrameSP>
+ScriptedFrameProvider::GetFrameAtIndex(uint32_t idx) {
+  if (!m_interface_sp)
+    return llvm::createStringError(
+        "cannot get stack frame: scripted frame provider not initialized");
+
+  auto create_frame_from_dict =
+      [this](StructuredData::Dictionary *dict,
+             uint32_t index) -> llvm::Expected<StackFrameSP> {
+    lldb::addr_t pc;
+    if (!dict->GetValueForKeyAsInteger("pc", pc))
+      return llvm::createStringError(
+          "missing 'pc' key from scripted frame dictionary");
+
+    Address symbol_addr;
+    symbol_addr.SetLoadAddress(pc, &GetThread().GetProcess()->GetTarget());
+
+    const lldb::addr_t cfa = LLDB_INVALID_ADDRESS;
+    const bool cfa_is_valid = false;
+    const bool artificial = false;
+    const bool behaves_like_zeroth_frame = false;
+    SymbolContext sc;
+    symbol_addr.CalculateSymbolContext(&sc);
+
+    ThreadSP thread_sp = GetThread().shared_from_this();
+    return std::make_shared<StackFrame>(thread_sp, index, index, cfa,
+                                        cfa_is_valid, pc,
+                                        StackFrame::Kind::Synthetic, artificial,
+                                        behaves_like_zeroth_frame, &sc);
+  };
+
+  auto create_frame_from_script_object =
+      [this](
+          StructuredData::ObjectSP object_sp) -> llvm::Expected<StackFrameSP> {
+    Status error;
+    if (!object_sp || !object_sp->GetAsGeneric())
+      return llvm::createStringError("invalid script object");
+
+    ThreadSP thread_sp = GetThread().shared_from_this();
+    auto frame_or_error = ScriptedFrame::Create(thread_sp, nullptr, nullptr,
+                                                object_sp->GetAsGeneric());
+
+    if (!frame_or_error) {
+      ScriptedInterface::ErrorWithMessage<bool>(
+          LLVM_PRETTY_FUNCTION, toString(frame_or_error.takeError()), error);
+      return error.ToError();
+    }
+
+    return *frame_or_error;
+  };
+
+  StructuredData::ObjectSP obj_sp = m_interface_sp->GetFrameAtIndex(idx);
+
+  // None/null means no more frames or error.
+  if (!obj_sp || !obj_sp->IsValid())
+    return llvm::createStringError("invalid script object returned for frame " +
+                                   llvm::Twine(idx));
+
+  StackFrameSP synth_frame_sp = nullptr;
+  if (StructuredData::UnsignedInteger *int_obj =
+          obj_sp->GetAsUnsignedInteger()) {
+    uint32_t real_frame_index = int_obj->GetValue();
+    if (real_frame_index < m_input_frames->GetNumFrames()) {
+      synth_frame_sp = m_input_frames->GetFrameAtIndex(real_frame_index);
+    }
+  } else if (StructuredData::Dictionary *dict = obj_sp->GetAsDictionary()) {
+    // Check if it's a dictionary describing a frame.
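+    // For reference, get_frame_at_index may produce any of the three shapes
+    // handled in this chain; a minimal Python sketch (hypothetical provider):
+    //
+    //   def get_frame_at_index(self, index):
+    //       if index == 0:
+    //           return {"pc": 0x1000}  # dictionary describing a frame
+    //       if index - 1 < len(self.input_frames):
+    //           return index - 1       # integer index into the input frames
+    //       return None                # end of stack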
+    auto frame_from_dict_or_err = create_frame_from_dict(dict, idx);
+    if (!frame_from_dict_or_err) {
+      return llvm::createStringError(llvm::Twine(
+          "couldn't create frame from dictionary at index " + llvm::Twine(idx) +
+          ": " + toString(frame_from_dict_or_err.takeError())));
+    }
+    synth_frame_sp = *frame_from_dict_or_err;
+  } else if (obj_sp->GetAsGeneric()) {
+    // It's a ScriptedFrame object.
+    auto frame_from_script_obj_or_err = create_frame_from_script_object(obj_sp);
+    if (!frame_from_script_obj_or_err) {
+      return llvm::createStringError(
+          llvm::Twine("couldn't create frame from script object at index " +
+                      llvm::Twine(idx) + ": " +
+                      toString(frame_from_script_obj_or_err.takeError())));
+    }
+    synth_frame_sp = *frame_from_script_obj_or_err;
+  } else {
+    return llvm::createStringError(
+        llvm::Twine("invalid return type from get_frame_at_index at index " +
+                    llvm::Twine(idx)));
+  }
+
+  if (!synth_frame_sp)
+    return llvm::createStringError(
+        llvm::Twine("failed to create frame at index " + llvm::Twine(idx)));
+
+  synth_frame_sp->SetFrameIndex(idx);
+
+  return synth_frame_sp;
+}
+
+namespace lldb_private {
+void lldb_initialize_ScriptedFrameProvider() {
+  ScriptedFrameProvider::Initialize();
+}
+
+void lldb_terminate_ScriptedFrameProvider() {
+  ScriptedFrameProvider::Terminate();
+}
+} // namespace lldb_private
diff --git a/lldb/source/Plugins/SyntheticFrameProvider/ScriptedFrameProvider/ScriptedFrameProvider.h b/lldb/source/Plugins/SyntheticFrameProvider/ScriptedFrameProvider/ScriptedFrameProvider.h
new file mode 100644
index 0000000000000..3434bf26ade24
--- /dev/null
+++ b/lldb/source/Plugins/SyntheticFrameProvider/ScriptedFrameProvider/ScriptedFrameProvider.h
@@ -0,0 +1,53 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_PLUGINS_SYNTHETICFRAMEPROVIDER_SCRIPTEDFRAMEPROVIDER_SCRIPTEDFRAMEPROVIDER_H
+#define LLDB_PLUGINS_SYNTHETICFRAMEPROVIDER_SCRIPTEDFRAMEPROVIDER_SCRIPTEDFRAMEPROVIDER_H
+
+#include "lldb/Target/SyntheticFrameProvider.h"
+#include "lldb/Utility/ScriptedMetadata.h"
+#include "lldb/Utility/Status.h"
+#include "lldb/lldb-forward.h"
+#include "llvm/Support/Error.h"
+
+namespace lldb_private {
+
+class ScriptedFrameProvider : public SyntheticFrameProvider {
+public:
+  static llvm::StringRef GetPluginNameStatic() {
+    return "ScriptedFrameProvider";
+  }
+
+  static llvm::Expected<lldb::SyntheticFrameProviderSP>
+  CreateInstance(lldb::StackFrameListSP input_frames,
+                 const ScriptedFrameProviderDescriptor &descriptor);
+
+  static void Initialize();
+
+  static void Terminate();
+
+  ScriptedFrameProvider(lldb::StackFrameListSP input_frames,
+                        lldb::ScriptedFrameProviderInterfaceSP interface_sp,
+                        const ScriptedFrameProviderDescriptor &descriptor);
+  ~ScriptedFrameProvider() override;
+
+  llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); }
+
+  std::string GetDescription() const override;
+
+  /// Get a single stack frame at the specified index.
+  llvm::Expected<lldb::StackFrameSP> GetFrameAtIndex(uint32_t idx) override;
+
+private:
+  lldb::ScriptedFrameProviderInterfaceSP m_interface_sp;
+  const ScriptedFrameProviderDescriptor &m_descriptor;
+};
+
+} // namespace lldb_private
+
+#endif // LLDB_PLUGINS_SYNTHETICFRAMEPROVIDER_SCRIPTEDFRAMEPROVIDER_SCRIPTEDFRAMEPROVIDER_H
diff --git a/lldb/source/Target/StackFrameList.cpp b/lldb/source/Target/StackFrameList.cpp
index ccf874fc03ebd..a661500ec862b 100644
--- a/lldb/source/Target/StackFrameList.cpp
+++ b/lldb/source/Target/StackFrameList.cpp
@@ -20,6 +20,7 @@
 #include "lldb/Target/StackFrame.h"
 #include "lldb/Target/StackFrameRecognizer.h"
 #include "lldb/Target/StopInfo.h"
+#include "lldb/Target/SyntheticFrameProvider.h"
 #include "lldb/Target/Target.h"
 #include "lldb/Target/Thread.h"
 #include "lldb/Target/Unwind.h"
@@ -55,6 +56,40 @@
 
 StackFrameList::~StackFrameList() { Clear(); }
 
+SyntheticStackFrameList::SyntheticStackFrameList(
+    Thread &thread, lldb::StackFrameListSP input_frames,
+    const lldb::StackFrameListSP &prev_frames_sp, bool show_inline_frames)
+    : StackFrameList(thread, prev_frames_sp, show_inline_frames),
+      m_input_frames(std::move(input_frames)) {}
+
+bool SyntheticStackFrameList::FetchFramesUpTo(
+    uint32_t end_idx, InterruptionControl allow_interrupt) {
+  // Check if the thread has a synthetic frame provider.
+  if (auto provider_sp = m_thread.GetFrameProvider()) {
+    // Use the synthetic frame provider to generate frames lazily.
+    // Keep fetching until we reach end_idx or the provider returns an error.
+    for (uint32_t idx = m_frames.size(); idx <= end_idx; idx++) {
+      if (allow_interrupt &&
+          m_thread.GetProcess()->GetTarget().GetDebugger().InterruptRequested())
+        return true;
+      auto frame_or_err = provider_sp->GetFrameAtIndex(idx);
+      if (!frame_or_err) {
+        // Provider returned error - we've reached the end.
+        LLDB_LOG_ERROR(GetLog(LLDBLog::Thread), frame_or_err.takeError(),
+                       "Frame provider reached end at index {0}: {1}", idx);
+        SetAllFramesFetched();
+        break;
+      }
+      m_frames.push_back(*frame_or_err);
+    }
+
+    return false; // Not interrupted.
+  }
+
+  // If no provider, fall back to the base implementation.
+  return StackFrameList::FetchFramesUpTo(end_idx, allow_interrupt);
+}
+
 void StackFrameList::CalculateCurrentInlinedDepth() {
   uint32_t cur_inlined_depth = GetCurrentInlinedDepth();
   if (cur_inlined_depth == UINT32_MAX) {
diff --git a/lldb/source/Target/SyntheticFrameProvider.cpp b/lldb/source/Target/SyntheticFrameProvider.cpp
index 241ce82c39be3..97ff42d1ed53e 100644
--- a/lldb/source/Target/SyntheticFrameProvider.cpp
+++ b/lldb/source/Target/SyntheticFrameProvider.cpp
@@ -8,10 +8,12 @@
 
 #include "lldb/Target/SyntheticFrameProvider.h"
 #include "lldb/Core/PluginManager.h"
+#include "lldb/Interpreter/Interfaces/ScriptedFrameProviderInterface.h"
 #include "lldb/Target/Thread.h"
 #include "lldb/Utility/LLDBLog.h"
 #include "lldb/Utility/Log.h"
 #include "lldb/Utility/Status.h"
+#include "lldb/Utility/Stream.h"
 
 using namespace lldb;
 using namespace lldb_private;
@@ -21,12 +23,17 @@ SyntheticFrameProvider::SyntheticFrameProvider(StackFrameListSP input_frames)
 
 SyntheticFrameProvider::~SyntheticFrameProvider() = default;
 
-void SyntheticFrameProviderDescriptor::Dump(Stream *s) const {
+void ScriptedFrameProviderDescriptor::Dump(Stream *s) const {
   if (!s)
     return;
 
+  s->Format("  ID: {0:x}\n", GetID());
   s->Printf("  Name: %s\n", GetName().str().c_str());
 
+  std::string description = GetDescription();
+  if (!description.empty())
+    s->Printf("  Description: %s\n", description.c_str());
+
   // Show thread filter information.
   if (thread_specs.empty()) {
     s->PutCString("  Thread Filter: (applies to all threads)\n");
@@ -41,9 +48,23 @@ void SyntheticFrameProviderDescriptor::Dump(Stream *s) const {
   }
 }
 
+uint32_t ScriptedFrameProviderDescriptor::GetID() const {
+  if (!scripted_metadata_sp)
+    return 0;
+
+  return scripted_metadata_sp->GetID();
+}
+
+std::string ScriptedFrameProviderDescriptor::GetDescription() const {
+  // If we have an interface, call get_description() to fetch it.
+  if (interface_sp && scripted_metadata_sp)
+    return interface_sp->GetDescription(scripted_metadata_sp->GetClassName());
+  return {};
+}
+
 llvm::Expected<lldb::SyntheticFrameProviderSP>
 SyntheticFrameProvider::CreateInstance(
     StackFrameListSP input_frames,
-    const SyntheticFrameProviderDescriptor &descriptor) {
+    const ScriptedFrameProviderDescriptor &descriptor) {
   if (!input_frames)
     return llvm::createStringError(
         "cannot create synthetic frame provider: invalid input frames");
diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp
index 3b51e17d1c4e0..3f182bc61392b 100644
--- a/lldb/source/Target/Target.cpp
+++ b/lldb/source/Target/Target.cpp
@@ -3718,6 +3718,61 @@ Status Target::Attach(ProcessAttachInfo &attach_info, Stream *stream) {
   return error;
 }
 
+llvm::Expected<uint32_t> Target::AddScriptedFrameProviderDescriptor(
+    const ScriptedFrameProviderDescriptor &descriptor) {
+  if (!descriptor.IsValid())
+    return llvm::createStringError("invalid frame provider descriptor");
+
+  llvm::StringRef name = descriptor.GetName();
+  if (name.empty())
+    return llvm::createStringError(
+        "frame provider descriptor has no class name");
+
+  std::lock_guard guard(
+      m_frame_provider_descriptors_mutex);
+
+  uint32_t descriptor_id = descriptor.GetID();
+  m_frame_provider_descriptors[descriptor_id] = descriptor;
+
+  // Clear frame providers on existing threads so they reload with new config.
+  if (ProcessSP process_sp = GetProcessSP())
+    for (ThreadSP thread_sp : process_sp->Threads())
+      thread_sp->ClearScriptedFrameProvider();
+
+  return descriptor_id;
+}
+
+bool Target::RemoveScriptedFrameProviderDescriptor(uint32_t id) {
+  std::lock_guard guard(
+      m_frame_provider_descriptors_mutex);
+  bool removed = m_frame_provider_descriptors.erase(id);
+
+  if (removed)
+    if (ProcessSP process_sp = GetProcessSP())
+      for (ThreadSP thread_sp : process_sp->Threads())
+        thread_sp->ClearScriptedFrameProvider();
+
+  return removed;
+}
+
+void Target::ClearScriptedFrameProviderDescriptors() {
+  std::lock_guard guard(
+      m_frame_provider_descriptors_mutex);
+
+  m_frame_provider_descriptors.clear();
+
+  if (ProcessSP process_sp = GetProcessSP())
+    for (ThreadSP thread_sp : process_sp->Threads())
+      thread_sp->ClearScriptedFrameProvider();
+}
+
+const llvm::DenseMap<uint32_t, ScriptedFrameProviderDescriptor> &
+Target::GetScriptedFrameProviderDescriptors() const {
+  std::lock_guard guard(
+      m_frame_provider_descriptors_mutex);
+  return m_frame_provider_descriptors;
+}
+
 void Target::FinalizeFileActions(ProcessLaunchInfo &info) {
   Log *log = GetLog(LLDBLog::Process);
diff --git a/lldb/source/Target/Thread.cpp b/lldb/source/Target/Thread.cpp
index 8c3e19725f8cb..b40e753aca1e9 100644
--- a/lldb/source/Target/Thread.cpp
+++ b/lldb/source/Target/Thread.cpp
@@ -13,9 +13,12 @@
 #include "lldb/Core/Module.h"
 #include "lldb/Core/StructuredDataImpl.h"
 #include "lldb/Host/Host.h"
+#include "lldb/Interpreter/Interfaces/ScriptedFrameInterface.h"
+#include "lldb/Interpreter/Interfaces/ScriptedFrameProviderInterface.h"
 #include "lldb/Interpreter/OptionValueFileSpecList.h"
 #include "lldb/Interpreter/OptionValueProperties.h"
 #include "lldb/Interpreter/Property.h"
+#include "lldb/Interpreter/ScriptInterpreter.h"
 #include "lldb/Symbol/Function.h"
 #include "lldb/Target/ABI.h"
 #include "lldb/Target/DynamicLoader.h"
@@ -26,6 +29,7 @@
 #include "lldb/Target/ScriptedThreadPlan.h"
 #include "lldb/Target/StackFrameRecognizer.h"
 #include "lldb/Target/StopInfo.h"
+#include "lldb/Target/SyntheticFrameProvider.h"
 #include "lldb/Target/SystemRuntime.h"
 #include "lldb/Target/Target.h"
 #include "lldb/Target/ThreadPlan.h"
@@ -45,6 +49,7 @@
 #include "lldb/Utility/LLDBLog.h"
 #include "lldb/Utility/Log.h"
 #include "lldb/Utility/RegularExpression.h"
+#include "lldb/Utility/ScriptedMetadata.h"
 #include "lldb/Utility/State.h"
 #include "lldb/Utility/Stream.h"
 #include "lldb/Utility/StreamString.h"
@@ -257,6 +262,7 @@ void Thread::DestroyThread() {
   std::lock_guard guard(m_frame_mutex);
   m_curr_frames_sp.reset();
   m_prev_frames_sp.reset();
+  m_frame_provider_sp.reset();
   m_prev_framezero_pc.reset();
 }
 
@@ -1439,13 +1445,76 @@ void Thread::CalculateExecutionContext(ExecutionContext &exe_ctx) {
 
 StackFrameListSP Thread::GetStackFrameList() {
   std::lock_guard guard(m_frame_mutex);
 
-  if (!m_curr_frames_sp)
+  if (m_curr_frames_sp)
+    return m_curr_frames_sp;
+
+  // First, try to load a frame provider if we don't have one yet.
+  if (!m_frame_provider_sp) {
+    ProcessSP process_sp = GetProcess();
+    if (process_sp) {
+      Target &target = process_sp->GetTarget();
+      const auto &descriptors = target.GetScriptedFrameProviderDescriptors();
+
+      // Find first descriptor that applies to this thread.
+      for (const auto &entry : descriptors) {
+        const ScriptedFrameProviderDescriptor &descriptor = entry.second;
+        if (descriptor.IsValid() && descriptor.AppliesToThread(*this)) {
+          if (llvm::Error error = LoadScriptedFrameProvider(descriptor)) {
+            LLDB_LOG_ERROR(GetLog(LLDBLog::Thread), std::move(error),
+                           "Failed to load scripted frame provider: {0}");
+          }
+          break; // Use first matching descriptor (success or failure).
+        }
+      }
+    }
+  }
+
+  // Create the frame list based on whether we have a provider.
+  if (m_frame_provider_sp) {
+    // We have a provider - create synthetic frame list.
+    StackFrameListSP input_frames = m_frame_provider_sp->GetInputFrames();
+    m_curr_frames_sp = std::make_shared<SyntheticStackFrameList>(
+        *this, input_frames, m_prev_frames_sp, true);
+  } else {
+    // No provider - use normal unwinder frames.
     m_curr_frames_sp =
         std::make_shared<StackFrameList>(*this, m_prev_frames_sp, true);
+  }
 
   return m_curr_frames_sp;
 }
 
+llvm::Error Thread::LoadScriptedFrameProvider(
+    const ScriptedFrameProviderDescriptor &descriptor) {
+  std::lock_guard guard(m_frame_mutex);
+
+  // Note: We don't create input_frames here - it will be created lazily
+  // by SyntheticStackFrameList when frames are first fetched.
+  // Creating them too early can cause crashes during thread initialization.
+
+  // Create a temporary StackFrameList just to get the thread reference for the
+  // provider. The provider won't actually use this - it will get real input
+  // frames from SyntheticStackFrameList later.
+  StackFrameListSP temp_frames =
+      std::make_shared<StackFrameList>(*this, m_prev_frames_sp, true);
+
+  auto provider_or_err =
+      SyntheticFrameProvider::CreateInstance(temp_frames, descriptor);
+  if (!provider_or_err)
+    return provider_or_err.takeError();
+
+  ClearScriptedFrameProvider();
+  m_frame_provider_sp = *provider_or_err;
+  return llvm::Error::success();
+}
+
+void Thread::ClearScriptedFrameProvider() {
+  std::lock_guard guard(m_frame_mutex);
+  m_frame_provider_sp.reset();
+  m_curr_frames_sp.reset();
+  m_prev_frames_sp.reset();
+}
+
 std::optional<lldb::addr_t> Thread::GetPreviousFrameZeroPC() {
   return m_prev_framezero_pc;
 }
@@ -1466,6 +1535,7 @@ void Thread::ClearStackFrames() {
 
   m_prev_frames_sp.swap(m_curr_frames_sp);
   m_curr_frames_sp.reset();
+  m_frame_provider_sp.reset();
 
   m_extended_info.reset();
   m_extended_info_fetched = false;
 }
diff --git a/lldb/source/Target/ThreadSpec.cpp b/lldb/source/Target/ThreadSpec.cpp
index ba4c3aa894553..624f64e3af800 100644
--- a/lldb/source/Target/ThreadSpec.cpp
+++ b/lldb/source/Target/ThreadSpec.cpp
@@ -19,6 +19,10 @@ const char *ThreadSpec::g_option_names[static_cast(
 
 ThreadSpec::ThreadSpec() : m_name(), m_queue_name() {}
 
+ThreadSpec::ThreadSpec(Thread &thread)
+    : m_index(thread.GetIndexID()), m_tid(thread.GetID()),
+      m_name(thread.GetName()), m_queue_name(thread.GetQueueName()) {}
+
 std::unique_ptr<ThreadSpec> ThreadSpec::CreateFromStructuredData(
     const StructuredData::Dictionary &spec_dict, Status &error) {
   uint32_t index = UINT32_MAX;
diff --git a/lldb/test/API/functionalities/scripted_frame_provider/Makefile b/lldb/test/API/functionalities/scripted_frame_provider/Makefile
new file mode 100644
index 0000000000000..99998b20bcb05
--- /dev/null
+++ b/lldb/test/API/functionalities/scripted_frame_provider/Makefile
@@ -0,0 +1,3 @@
+CXX_SOURCES := main.cpp
+
+include Makefile.rules
diff --git a/lldb/test/API/functionalities/scripted_frame_provider/TestScriptedFrameProvider.py b/lldb/test/API/functionalities/scripted_frame_provider/TestScriptedFrameProvider.py
new file mode 100644
index 0000000000000..189ca2f147f9d
--- /dev/null
+++ b/lldb/test/API/functionalities/scripted_frame_provider/TestScriptedFrameProvider.py
@@ -0,0 +1,339 @@
+"""
+Test scripted frame provider functionality.
+"""
+
+import os
+
+import lldb
+from lldbsuite.test.lldbtest import TestBase
+from lldbsuite.test import lldbutil
+
+
+class ScriptedFrameProviderTestCase(TestBase):
+    NO_DEBUG_INFO_TESTCASE = True
+
+    def setUp(self):
+        TestBase.setUp(self)
+        self.source = "main.cpp"
+
+    def test_replace_all_frames(self):
+        """Test that we can replace the entire stack."""
+        self.build()
+        target, process, thread, bkpt = lldbutil.run_to_source_breakpoint(
+            self, "Break here", lldb.SBFileSpec(self.source), only_one_thread=False
+        )
+
+        # Import the test frame provider
+        script_path = os.path.join(self.getSourceDir(), "test_frame_providers.py")
+        self.runCmd("command script import " + script_path)
+
+        # Attach the Replace provider
+        error = lldb.SBError()
+        provider_id = target.RegisterScriptedFrameProvider(
+            "test_frame_providers.ReplaceFrameProvider",
+            lldb.SBStructuredData(),
+            error,
+        )
+        self.assertTrue(error.Success(), f"Failed to register provider: {error}")
+        self.assertNotEqual(provider_id, 0, "Provider ID should be non-zero")
+
+        # Verify we have exactly 3 synthetic frames
+        self.assertEqual(thread.GetNumFrames(), 3, "Should have 3 synthetic frames")
+
+        # Verify frame indices and PCs (dictionary-based frames don't have
+        # custom function names)
+        frame0 = thread.GetFrameAtIndex(0)
+        self.assertIsNotNone(frame0)
+        self.assertEqual(frame0.GetPC(), 0x1000)
+
+        frame1 = thread.GetFrameAtIndex(1)
+        self.assertIsNotNone(frame1)
+        self.assertIn("thread_func", frame1.GetFunctionName())
+
+        frame2 = thread.GetFrameAtIndex(2)
+        self.assertIsNotNone(frame2)
+        self.assertEqual(frame2.GetPC(), 0x3000)
+
+    def test_prepend_frames(self):
+        """Test that we can add frames before real stack."""
+        self.build()
+        target, process, thread, bkpt = lldbutil.run_to_source_breakpoint(
+            self, "Break here", lldb.SBFileSpec(self.source), only_one_thread=False
+        )
+
+        # Get original frame count and PC
+        original_frame_count = thread.GetNumFrames()
+        self.assertGreaterEqual(
+            original_frame_count, 2, "Should have at least 2 real frames"
+        )
+
+        # Import and attach Prepend provider
+        script_path = os.path.join(self.getSourceDir(), "test_frame_providers.py")
+        self.runCmd("command script import " + script_path)
+
+        error = lldb.SBError()
+        provider_id = target.RegisterScriptedFrameProvider(
+            "test_frame_providers.PrependFrameProvider",
+            lldb.SBStructuredData(),
+            error,
+        )
+        self.assertTrue(error.Success(), f"Failed to register provider: {error}")
+        self.assertNotEqual(provider_id, 0, "Provider ID should be non-zero")
+
+        # Verify we have 2 more frames
+        new_frame_count = thread.GetNumFrames()
+        self.assertEqual(new_frame_count, original_frame_count + 2)
+
+        # Verify first 2 frames are synthetic (check PCs, not function names)
+        frame0 = thread.GetFrameAtIndex(0)
+        self.assertEqual(frame0.GetPC(), 0x9000)
+
+        frame1 = thread.GetFrameAtIndex(1)
+        self.assertEqual(frame1.GetPC(), 0xA000)
+
+        # Verify frame 2 is the original real frame 0
+        frame2 = thread.GetFrameAtIndex(2)
+        self.assertIn("thread_func", frame2.GetFunctionName())
+
+    def test_append_frames(self):
+        """Test that we can add frames after real stack."""
+        self.build()
+        target, process, thread, bkpt = lldbutil.run_to_source_breakpoint(
+            self, "Break here", lldb.SBFileSpec(self.source), only_one_thread=False
+        )
+
+        # Get original frame count
+        original_frame_count = thread.GetNumFrames()
+
+        # Import and attach Append provider
+        script_path = os.path.join(self.getSourceDir(), "test_frame_providers.py")
+        self.runCmd("command script import " + script_path)
+
+        error = lldb.SBError()
+        provider_id = target.RegisterScriptedFrameProvider(
+            "test_frame_providers.AppendFrameProvider",
+            lldb.SBStructuredData(),
+            error,
+        )
+        self.assertTrue(error.Success(), f"Failed to register provider: {error}")
+        self.assertNotEqual(provider_id, 0, "Provider ID should be non-zero")
+
+        # Verify we have 1 more frame
+        new_frame_count = thread.GetNumFrames()
+        self.assertEqual(new_frame_count, original_frame_count + 1)
+
+        # Verify first frames are still real
+        frame0 = thread.GetFrameAtIndex(0)
+        self.assertIn("thread_func", frame0.GetFunctionName())
+
+        frame_n_plus_1 = thread.GetFrameAtIndex(new_frame_count - 1)
+        self.assertEqual(frame_n_plus_1.GetPC(), 0x10)
+
+    def test_scripted_frame_objects(self):
+        """Test that provider can return ScriptedFrame objects."""
+        self.build()
+        target, process, thread, bkpt = lldbutil.run_to_source_breakpoint(
+            self, "Break here", lldb.SBFileSpec(self.source), only_one_thread=False
+        )
+
+        # Import the provider that returns ScriptedFrame objects
+        script_path = os.path.join(self.getSourceDir(), "test_frame_providers.py")
+        self.runCmd("command script import " + script_path)
+
+        error = lldb.SBError()
+        provider_id = target.RegisterScriptedFrameProvider(
+            "test_frame_providers.ScriptedFrameObjectProvider",
+            lldb.SBStructuredData(),
+            error,
+        )
+        self.assertTrue(error.Success(), f"Failed to register provider: {error}")
+        self.assertNotEqual(provider_id, 0, "Provider ID should be non-zero")
+
+        # Verify we have 5 frames
+        self.assertEqual(
+            thread.GetNumFrames(), 5, "Should have 5 custom scripted frames"
+        )
+
+        # Verify frame properties from CustomScriptedFrame
+        frame0 = thread.GetFrameAtIndex(0)
+        self.assertIsNotNone(frame0)
+        self.assertEqual(frame0.GetFunctionName(), "custom_scripted_frame_0")
+        self.assertEqual(frame0.GetPC(), 0x5000)
+        self.assertTrue(frame0.IsSynthetic(), "Frame should be marked as synthetic")
+
+        frame1 = thread.GetFrameAtIndex(1)
+        self.assertIsNotNone(frame1)
+        self.assertEqual(frame1.GetPC(), 0x6000)
+
+        frame2 = thread.GetFrameAtIndex(2)
+        self.assertIsNotNone(frame2)
+        self.assertEqual(frame2.GetFunctionName(), "custom_scripted_frame_2")
+        self.assertEqual(frame2.GetPC(), 0x7000)
+        self.assertTrue(frame2.IsSynthetic(), "Frame should be marked as synthetic")
+
+    def test_applies_to_thread(self):
+        """Test that applies_to_thread filters which threads get the provider."""
+        self.build()
+        target, process, thread, bkpt = lldbutil.run_to_source_breakpoint(
+            self, "Break here", lldb.SBFileSpec(self.source), only_one_thread=False
+        )
+
+        # We should have at least 2 threads (worker threads) at the breakpoint
+        num_threads = process.GetNumThreads()
+        self.assertGreaterEqual(
+            num_threads, 2, "Should have at least 2 threads at breakpoint"
+        )
+
+        # Import the test frame provider
+        script_path = os.path.join(self.getSourceDir(), "test_frame_providers.py")
+        self.runCmd("command script import " + script_path)
+
+        # Collect original thread info before applying provider
+        thread_info = {}
+        for i in range(num_threads):
+            t = process.GetThreadAtIndex(i)
+            thread_info[t.GetIndexID()] = {
+                "frame_count": t.GetNumFrames(),
+                "pc": t.GetFrameAtIndex(0).GetPC(),
+            }
+
+        # Register the ThreadFilterFrameProvider which only applies to thread ID 1
+        error = lldb.SBError()
+        provider_id = target.RegisterScriptedFrameProvider(
"test_frame_providers.ThreadFilterFrameProvider", + lldb.SBStructuredData(), + error, + ) + self.assertTrue(error.Success(), f"Failed to register provider: {error}") + self.assertNotEqual(provider_id, 0, "Provider ID should be non-zero") + + # Check each thread + thread_id_1_found = False + for i in range(num_threads): + t = process.GetThreadAtIndex(i) + thread_id = t.GetIndexID() + + if thread_id == 1: + # Thread with ID 1 should have synthetic frame + thread_id_1_found = True + self.assertEqual( + t.GetNumFrames(), + 1, + f"Thread with ID 1 should have 1 synthetic frame", + ) + self.assertEqual( + t.GetFrameAtIndex(0).GetPC(), + 0xFFFF, + f"Thread with ID 1 should have synthetic PC 0xFFFF", + ) + else: + # Other threads should keep their original frames + self.assertEqual( + t.GetNumFrames(), + thread_info[thread_id]["frame_count"], + f"Thread with ID {thread_id} should not be affected by provider", + ) + self.assertEqual( + t.GetFrameAtIndex(0).GetPC(), + thread_info[thread_id]["pc"], + f"Thread with ID {thread_id} should have its original PC", + ) + + # We should have found at least one thread with ID 1 + self.assertTrue( + thread_id_1_found, + "Should have found a thread with ID 1 to test filtering", + ) + + def test_remove_frame_provider_by_id(self): + """Test that RemoveScriptedFrameProvider removes a specific provider by ID.""" + self.build() + target, process, thread, bkpt = lldbutil.run_to_source_breakpoint( + self, "Break here", lldb.SBFileSpec(self.source), only_one_thread=False + ) + + # Import the test frame providers + script_path = os.path.join(self.getSourceDir(), "test_frame_providers.py") + self.runCmd("command script import " + script_path) + + # Get original frame count + original_frame_count = thread.GetNumFrames() + original_pc = thread.GetFrameAtIndex(0).GetPC() + + # Register the first provider and get its ID + error = lldb.SBError() + provider_id_1 = target.RegisterScriptedFrameProvider( + "test_frame_providers.ReplaceFrameProvider", + lldb.SBStructuredData(), + error, + ) + self.assertTrue(error.Success(), f"Failed to register provider 1: {error}") + + # Verify first provider is active (3 synthetic frames) + self.assertEqual(thread.GetNumFrames(), 3, "Should have 3 synthetic frames") + self.assertEqual( + thread.GetFrameAtIndex(0).GetPC(), 0x1000, "Should have first provider's PC" + ) + + # Register a second provider and get its ID + provider_id_2 = target.RegisterScriptedFrameProvider( + "test_frame_providers.PrependFrameProvider", + lldb.SBStructuredData(), + error, + ) + self.assertTrue(error.Success(), f"Failed to register provider 2: {error}") + + # Verify IDs are different + self.assertNotEqual( + provider_id_1, provider_id_2, "Provider IDs should be unique" + ) + + # Now remove the first provider by ID + result = target.RemoveScriptedFrameProvider(provider_id_1) + self.assertSuccess( + result, f"Should successfully remove provider with ID {provider_id_1}" + ) + + # After removing the first provider, the second provider should still be active + # The PrependFrameProvider adds 2 frames before the real stack + # Since ReplaceFrameProvider had 3 frames, and we removed it, we should now + # have the original frames (from real stack) with PrependFrameProvider applied + new_frame_count = thread.GetNumFrames() + self.assertEqual( + new_frame_count, + original_frame_count + 2, + "Should have original frames + 2 prepended frames", + ) + + # First two frames should be from PrependFrameProvider + self.assertEqual( + thread.GetFrameAtIndex(0).GetPC(), + 0x9000, + 
"First frame should be from PrependFrameProvider", + ) + self.assertEqual( + thread.GetFrameAtIndex(1).GetPC(), + 0xA000, + "Second frame should be from PrependFrameProvider", + ) + + # Remove the second provider + result = target.RemoveScriptedFrameProvider(provider_id_2) + self.assertSuccess( + result, f"Should successfully remove provider with ID {provider_id_2}" + ) + + # After removing both providers, frames should be back to original + self.assertEqual( + thread.GetNumFrames(), + original_frame_count, + "Should restore original frame count", + ) + self.assertEqual( + thread.GetFrameAtIndex(0).GetPC(), + original_pc, + "Should restore original PC", + ) + + # Try to remove a provider that doesn't exist + result = target.RemoveScriptedFrameProvider(999999) + self.assertTrue(result.Fail(), "Should fail to remove non-existent provider") diff --git a/lldb/test/API/functionalities/scripted_frame_provider/main.cpp b/lldb/test/API/functionalities/scripted_frame_provider/main.cpp new file mode 100644 index 0000000000000..f15cb282f9d25 --- /dev/null +++ b/lldb/test/API/functionalities/scripted_frame_provider/main.cpp @@ -0,0 +1,55 @@ +// Multi-threaded test program for testing frame providers. + +#include +#include +#include +#include + +std::mutex mtx; +std::condition_variable cv; +int ready_count = 0; +constexpr int NUM_THREADS = 2; + +void thread_func(int thread_num) { + std::cout << "Thread " << thread_num << " started\n"; + + { + std::unique_lock lock(mtx); + ready_count++; + if (ready_count == NUM_THREADS + 1) { + cv.notify_all(); + } else { + cv.wait(lock, [] { return ready_count == NUM_THREADS + 1; }); + } + } + + std::cout << "Thread " << thread_num << " at breakpoint\n"; // Break here +} + +int main(int argc, char **argv) { + std::thread threads[NUM_THREADS]; + + for (int i = 0; i < NUM_THREADS; i++) { + threads[i] = std::thread(thread_func, i); + } + + { + std::unique_lock lock(mtx); + ready_count++; + if (ready_count == NUM_THREADS + 1) { + cv.notify_all(); + } else { + cv.wait(lock, [] { return ready_count == NUM_THREADS + 1; }); + } + } + + std::cout << "Main thread at barrier\n"; + + // Join threads + for (int i = 0; i < NUM_THREADS; i++) { + threads[i].join(); + } + + std::cout << "All threads completed\n"; + return 0; +} diff --git a/lldb/test/API/functionalities/scripted_frame_provider/test_frame_providers.py b/lldb/test/API/functionalities/scripted_frame_provider/test_frame_providers.py new file mode 100644 index 0000000000000..91aa13e44339a --- /dev/null +++ b/lldb/test/API/functionalities/scripted_frame_provider/test_frame_providers.py @@ -0,0 +1,176 @@ +""" +Test frame providers for scripted frame provider functionality. 
+ +These providers demonstrate various merge strategies: +- Replace: Replace entire stack +- Prepend: Add frames before real stack +- Append: Add frames after real stack + +It also shows the ability to mix a dictionary, a ScriptedFrame, or an SBFrame +index to create stack frames. +""" + +import lldb +from lldb.plugins.scripted_process import ScriptedFrame +from lldb.plugins.scripted_frame_provider import ScriptedFrameProvider + + +class ReplaceFrameProvider(ScriptedFrameProvider): + """Replace entire stack with custom frames.""" + + def __init__(self, input_frames, args): + super().__init__(input_frames, args) + self.frames = [ + { + "idx": 0, + "pc": 0x1000, + }, + 0, + { + "idx": 2, + "pc": 0x3000, + }, + ] + + @staticmethod + def get_description(): + """Return a description of this provider.""" + return "Replace entire stack with 3 custom frames" + + def get_frame_at_index(self, index): + if index >= len(self.frames): + return None + return self.frames[index] + + +class PrependFrameProvider(ScriptedFrameProvider): + """Prepend synthetic frames before real stack.""" + + def __init__(self, input_frames, args): + super().__init__(input_frames, args) + + @staticmethod + def get_description(): + """Return a description of this provider.""" + return "Prepend 2 synthetic frames before real stack" + + def get_frame_at_index(self, index): + if index == 0: + return {"pc": 0x9000} + elif index == 1: + return {"pc": 0xA000} + elif index - 2 < len(self.input_frames): + return index - 2 # Return real frame index + return None + + +class AppendFrameProvider(ScriptedFrameProvider): + """Append synthetic frames after real stack.""" + + def __init__(self, input_frames, args): + super().__init__(input_frames, args) + + @staticmethod + def get_description(): + """Return a description of this provider.""" + return "Append 1 synthetic frame after real stack" + + def get_frame_at_index(self, index): + if index < len(self.input_frames): + return index # Return real frame index + elif index == len(self.input_frames): + return { + "idx": 1, + "pc": 0x10, + } + return None + + +class CustomScriptedFrame(ScriptedFrame): + """Custom scripted frame with full control over frame behavior.""" + + def __init__(self, thread, idx, pc, function_name): + # Initialize structured data args + args = lldb.SBStructuredData() + super().__init__(thread, args) + + self.idx = idx + self.pc = pc + self.function_name = function_name + + def get_id(self): + """Return the frame index.""" + return self.idx + + def get_pc(self): + """Return the program counter.""" + return self.pc + + def get_function_name(self): + """Return the function name.""" + return self.function_name + + def is_artificial(self): + """Not an artificial frame.""" + return False + + def is_hidden(self): + """Not hidden.""" + return False + + def get_register_context(self): + """No register context for this test.""" + return None + + +class ScriptedFrameObjectProvider(ScriptedFrameProvider): + """Provider that returns ScriptedFrame objects instead of dictionaries.""" + + def __init__(self, input_frames, args): + super().__init__(input_frames, args) + + @staticmethod + def get_description(): + """Return a description of this provider.""" + return "Provider returning custom ScriptedFrame objects" + + def get_frame_at_index(self, index): + """Return ScriptedFrame objects or dictionaries based on index.""" + if index == 0: + return CustomScriptedFrame( + self.thread, 0, 0x5000, "custom_scripted_frame_0" + ) + elif index == 1: + return {"pc": 0x6000} + elif index == 2:
return CustomScriptedFrame( + self.thread, 2, 0x7000, "custom_scripted_frame_2" + ) + elif index == 3: + return len(self.input_frames) - 2 # Real frame index + elif index == 4: + return len(self.input_frames) - 1 # Real frame index + return None + + +class ThreadFilterFrameProvider(ScriptedFrameProvider): + """Provider that only applies to thread with ID 1.""" + + @staticmethod + def applies_to_thread(thread): + """Only apply to thread with index ID 1.""" + return thread.GetIndexID() == 1 + + def __init__(self, input_frames, args): + super().__init__(input_frames, args) + + @staticmethod + def get_description(): + """Return a description of this provider.""" + return "Provider that only applies to thread ID 1" + + def get_frame_at_index(self, index): + """Return a single synthetic frame.""" + if index == 0: + return {"pc": 0xFFFF} + return None diff --git a/lldb/test/Shell/Unwind/Inputs/call-asm.c b/lldb/test/Shell/Unwind/Inputs/call-asm.c index b154c1ac1385d..778c16b36a761 100644 --- a/lldb/test/Shell/Unwind/Inputs/call-asm.c +++ b/lldb/test/Shell/Unwind/Inputs/call-asm.c @@ -1,3 +1,3 @@ -int asm_main() asm("asm_main"); - +// Explicit mangling is necessary because on Darwin an underscore is prepended to the symbol. +int asm_main() __asm("asm_main"); int main() { return asm_main(); } diff --git a/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp b/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp index a63b740d9472f..5694aeeff3e5b 100644 --- a/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp +++ b/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp @@ -136,6 +136,11 @@ lldb_private::python::LLDBSWIGPython_CastPyObjectToSBStream(PyObject *data) { return nullptr; } +void * +lldb_private::python::LLDBSWIGPython_CastPyObjectToSBThread(PyObject *data) { + return nullptr; +} + void * lldb_private::python::LLDBSWIGPython_CastPyObjectToSBFrame(PyObject *data) { return nullptr; diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 34ba3a5f60405..5ae1f394896ef 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -2747,6 +2747,27 @@ For example: all arguments are not undef and not poison. Otherwise, it is undefined behavior. +``"modular-format"="<archetype>,<fmt_str_idx>,<first_arg_idx>,<modular_impl_fn>,<impl_name>,<aspects...>"`` + This attribute indicates that the implementation is modular on a particular + format string argument. If the compiler can determine that not all aspects + of the implementation are needed, it can report which aspects were needed + and redirect the call to a modular implementation function instead. + + The compiler reports that an implementation aspect is needed by issuing a + relocation for the symbol ``<impl_name>_<aspect>``. This arranges for code + and data needed to support the aspect of the implementation to be brought + into the link to satisfy weak references in the modular implementation + function. + + The first three arguments have the same semantics as the arguments to the C + ``format`` attribute. + + The following aspects are currently supported: + + - ``float``: The call has a floating-point argument + + + Call Site Attributes ---------------------- diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index fd78c97c86d24..c76717fdc990c 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -70,6 +70,10 @@ Changes to the LLVM IR * Added `@llvm.reloc.none` intrinsic to emit null relocations to symbols. This emits an undefined symbol reference without adding any dedicated code or data to bear the relocation.
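For illustration, here is a minimal sketch of a declaration carrying the ``modular-format`` attribute described in the LangRef hunk above. The field layout follows that description (the C ``format``-style triple, then the modular implementation function, then the implementation name used as the relocation prefix, then the aspect list); the names ``__modular_printf`` and ``__printf`` are placeholders for this sketch, not symbols defined by this patch:

.. code-block:: llvm

    ; printf is modular on argument 1 (the format string); checking of the
    ; passed arguments starts at argument 2. If a call site needs floating-point
    ; support, the compiler reports that by issuing a relocation against the
    ; hypothetical symbol __printf_float; otherwise the call can be redirected
    ; to __modular_printf without pulling that aspect into the link.
    declare i32 @printf(ptr, ...)
        "modular-format"="printf,1,2,__modular_printf,__printf,float"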
+* Added `modular-format` attribute to dynamically pull in aspects of libc + format string function implementations from statically linked libcs based on + the requirements of each call. Currently only `float` is supported; this can + keep floating-point support out of printf if it can be proven unused. Changes to LLVM infrastructure ------------------------------ diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h index 02581d31145fc..d7bac959cf244 100644 --- a/llvm/include/llvm/CodeGen/AsmPrinter.h +++ b/llvm/include/llvm/CodeGen/AsmPrinter.h @@ -18,7 +18,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/IntrusiveRefCntPtr.h" #include "llvm/ADT/MapVector.h" -#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/StaticDataProfileInfo.h" @@ -207,9 +207,9 @@ class LLVM_ABI AsmPrinter : public MachineFunctionPass { using CGTypeId = uint64_t; /// Unique target type IDs. - SmallSet IndirectCalleeTypeIDs; + SmallSetVector IndirectCalleeTypeIDs; /// Unique direct callees. - SmallSet DirectCallees; + SmallSetVector DirectCallees; }; enum CallGraphSectionFormatVersion : uint8_t { diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 1c167af4b0478..a52ad41d0f1b3 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -334,7 +334,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { break; case Intrinsic::sincos: - LC = RTLIB::getSINCOS(ScalarVT); + LC = RTLIB::getSINCOS(VT); + if (LC == RTLIB::UNKNOWN_LIBCALL) + LC = RTLIB::getSINCOS(ScalarVT); + else if (VT.isVector()) + IsVectorCall = true; + break; default: return std::nullopt; diff --git a/llvm/include/llvm/CodeGen/LiveIntervals.h b/llvm/include/llvm/CodeGen/LiveIntervals.h index c252f9d99f2af..32027766e7093 100644 --- a/llvm/include/llvm/CodeGen/LiveIntervals.h +++ b/llvm/include/llvm/CodeGen/LiveIntervals.h @@ -412,7 +412,7 @@ class LiveIntervals { /// Return the live range for register unit \p Unit. It will be computed if /// it doesn't exist. - LiveRange &getRegUnit(unsigned Unit) { + LiveRange &getRegUnit(MCRegUnit Unit) { LiveRange *LR = RegUnitRanges[Unit]; if (!LR) { // Compute missing ranges on demand. @@ -425,15 +425,15 @@ class LiveIntervals { /// Return the live range for register unit \p Unit if it has already been /// computed, or nullptr if it hasn't been computed yet. - LiveRange *getCachedRegUnit(unsigned Unit) { return RegUnitRanges[Unit]; } + LiveRange *getCachedRegUnit(MCRegUnit Unit) { return RegUnitRanges[Unit]; } - const LiveRange *getCachedRegUnit(unsigned Unit) const { + const LiveRange *getCachedRegUnit(MCRegUnit Unit) const { return RegUnitRanges[Unit]; } /// Remove computed live range for register unit \p Unit. Subsequent uses should rely on on-demand recomputation.
- void removeRegUnit(unsigned Unit) { + void removeRegUnit(MCRegUnit Unit) { delete RegUnitRanges[Unit]; RegUnitRanges[Unit] = nullptr; } @@ -489,7 +489,7 @@ class LiveIntervals { void dumpInstrs() const; void computeLiveInRegUnits(); - LLVM_ABI void computeRegUnitRange(LiveRange &, unsigned Unit); + LLVM_ABI void computeRegUnitRange(LiveRange &, MCRegUnit Unit); LLVM_ABI bool computeVirtRegInterval(LiveInterval &); using ShrinkToUsesWorkList = SmallVector, 16>; diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h index 27b30bd5929ff..6982dae4718d1 100644 --- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h @@ -982,7 +982,7 @@ class MachineRegisterInfo { /// root registers, the root register and all super registers are reserved. /// This currently iterates the register hierarchy and may be slower than /// expected. - LLVM_ABI bool isReservedRegUnit(unsigned Unit) const; + LLVM_ABI bool isReservedRegUnit(MCRegUnit Unit) const; /// isAllocatable - Returns true when PhysReg belongs to an allocatable /// register class and it hasn't been reserved. diff --git a/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h b/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h index d987a5cf1c3df..2893e5ce6647e 100644 --- a/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h +++ b/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h @@ -77,23 +77,23 @@ class MBBReachingDefsInfo { AllReachingDefs[MBBNumber].resize(NumRegUnits); } - void append(unsigned MBBNumber, unsigned Unit, int Def) { + void append(unsigned MBBNumber, MCRegUnit Unit, int Def) { AllReachingDefs[MBBNumber][Unit].push_back(Def); } - void prepend(unsigned MBBNumber, unsigned Unit, int Def) { + void prepend(unsigned MBBNumber, MCRegUnit Unit, int Def) { auto &Defs = AllReachingDefs[MBBNumber][Unit]; Defs.insert(Defs.begin(), Def); } - void replaceFront(unsigned MBBNumber, unsigned Unit, int Def) { + void replaceFront(unsigned MBBNumber, MCRegUnit Unit, int Def) { assert(!AllReachingDefs[MBBNumber][Unit].empty()); *AllReachingDefs[MBBNumber][Unit].begin() = Def; } void clear() { AllReachingDefs.clear(); } - ArrayRef defs(unsigned MBBNumber, unsigned Unit) const { + ArrayRef defs(unsigned MBBNumber, MCRegUnit Unit) const { if (AllReachingDefs[MBBNumber].empty()) // Block IDs are not necessarily dense. return ArrayRef(); diff --git a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h index ab0d7e334df44..059a3444c609c 100644 --- a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h +++ b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h @@ -77,9 +77,9 @@ namespace llvm { struct PhysRegSUOper { SUnit *SU; int OpIdx; - unsigned RegUnit; + MCRegUnit RegUnit; - PhysRegSUOper(SUnit *su, int op, unsigned R) + PhysRegSUOper(SUnit *su, int op, MCRegUnit R) : SU(su), OpIdx(op), RegUnit(R) {} unsigned getSparseSetIndex() const { return RegUnit; } diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h index 54b3f6a04d241..c85e070cbe084 100644 --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -1454,7 +1454,7 @@ LLVM_ABI Printable printReg(Register Reg, /// fp0~st7 - Dual roots. 
/// /// Usage: OS << printRegUnit(Unit, TRI) << '\n'; -LLVM_ABI Printable printRegUnit(unsigned Unit, const TargetRegisterInfo *TRI); +LLVM_ABI Printable printRegUnit(MCRegUnit Unit, const TargetRegisterInfo *TRI); /// Create Printable object to print virtual registers and physical /// registers on a \ref raw_ostream. diff --git a/llvm/include/llvm/MC/MCRegisterInfo.h b/llvm/include/llvm/MC/MCRegisterInfo.h index f611edd715398..e6dbb38dfee67 100644 --- a/llvm/include/llvm/MC/MCRegisterInfo.h +++ b/llvm/include/llvm/MC/MCRegisterInfo.h @@ -687,7 +687,7 @@ class MCRegUnitMaskIterator { } /// Returns a (RegUnit, LaneMask) pair. - std::pair operator*() const { + std::pair operator*() const { return std::make_pair(*RUIter, *MaskListIter); } @@ -719,7 +719,7 @@ class MCRegUnitRootIterator { public: MCRegUnitRootIterator() = default; - MCRegUnitRootIterator(unsigned RegUnit, const MCRegisterInfo *MCRI) { + MCRegUnitRootIterator(MCRegUnit RegUnit, const MCRegisterInfo *MCRI) { assert(RegUnit < MCRI->getNumRegUnits() && "Invalid register unit"); Reg0 = MCRI->RegUnitRoots[RegUnit][0]; Reg1 = MCRI->RegUnitRoots[RegUnit][1]; diff --git a/llvm/include/llvm/Support/CodeGen.h b/llvm/include/llvm/Support/CodeGen.h index 7d204e6db310c..a80122e74a085 100644 --- a/llvm/include/llvm/Support/CodeGen.h +++ b/llvm/include/llvm/Support/CodeGen.h @@ -121,7 +121,13 @@ namespace llvm { }; // Specify what functions should keep the frame pointer. - enum class FramePointerKind { None, NonLeaf, All, Reserved }; + enum class FramePointerKind { + None, + NonLeaf, + All, + Reserved, + NonLeafNoReserve + }; // Specify what type of zeroing callee-used registers. namespace ZeroCallUsedRegs { diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h index 0e82dd212f34d..11b76cd183108 100644 --- a/llvm/include/llvm/TargetParser/Triple.h +++ b/llvm/include/llvm/TargetParser/Triple.h @@ -554,15 +554,29 @@ class Triple { return getOSVersion() < VersionTuple(Major, Minor, Micro); } + bool isOSVersionGE(unsigned Major, unsigned Minor = 0, + unsigned Micro = 0) const { + return !isOSVersionLT(Major, Minor, Micro); + } + bool isOSVersionLT(const Triple &Other) const { return getOSVersion() < Other.getOSVersion(); } + bool isOSVersionGE(const Triple &Other) const { + return getOSVersion() >= Other.getOSVersion(); + } + /// Comparison function for checking OS X version compatibility, which handles /// supporting skewed version numbering schemes used by the "darwin" triples. LLVM_ABI bool isMacOSXVersionLT(unsigned Major, unsigned Minor = 0, unsigned Micro = 0) const; + bool isMacOSXVersionGE(unsigned Major, unsigned Minor = 0, + unsigned Micro = 0) const { + return !isMacOSXVersionLT(Major, Minor, Micro); + } + /// Is this a Mac OS X triple. For legacy reasons, we support both "darwin" /// and "osx" as OS X triples. 
bool isMacOSX() const { diff --git a/llvm/lib/CAS/CMakeLists.txt b/llvm/lib/CAS/CMakeLists.txt index aad77dce370d8..b03895cfc77d7 100644 --- a/llvm/lib/CAS/CMakeLists.txt +++ b/llvm/lib/CAS/CMakeLists.txt @@ -1,3 +1,7 @@ +if (UNIX AND "${CMAKE_SYSTEM_NAME}" MATCHES "AIX") + set(additional_libs bsd) +endif() + add_llvm_component_library(LLVMCAS ActionCache.cpp ActionCaches.cpp @@ -20,6 +24,7 @@ add_llvm_component_library(LLVMCAS LINK_LIBS ${LLVM_PTHREAD_LIB} + ${additional_libs} LINK_COMPONENTS Support diff --git a/llvm/lib/CodeGen/CommandFlags.cpp b/llvm/lib/CodeGen/CommandFlags.cpp index c1365f499dcf5..02ae722b5a56e 100644 --- a/llvm/lib/CodeGen/CommandFlags.cpp +++ b/llvm/lib/CodeGen/CommandFlags.cpp @@ -210,6 +210,9 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() { clEnumValN(FramePointerKind::All, "all", "Disable frame pointer elimination"), clEnumValN(FramePointerKind::NonLeaf, "non-leaf", + "Disable frame pointer elimination for non-leaf frame but " + "reserve the register in leaf functions"), + clEnumValN(FramePointerKind::NonLeafNoReserve, "non-leaf-no-reserve", "Disable frame pointer elimination for non-leaf frame"), clEnumValN(FramePointerKind::Reserved, "reserved", "Enable frame pointer elimination, but reserve the frame " @@ -687,6 +690,8 @@ void codegen::setFunctionAttributes(StringRef CPU, StringRef Features, NewAttrs.addAttribute("frame-pointer", "all"); else if (getFramePointerUsage() == FramePointerKind::NonLeaf) NewAttrs.addAttribute("frame-pointer", "non-leaf"); + else if (getFramePointerUsage() == FramePointerKind::NonLeafNoReserve) + NewAttrs.addAttribute("frame-pointer", "non-leaf-no-reserve"); else if (getFramePointerUsage() == FramePointerKind::Reserved) NewAttrs.addAttribute("frame-pointer", "reserved"); else if (getFramePointerUsage() == FramePointerKind::None) diff --git a/llvm/lib/CodeGen/EarlyIfConversion.cpp b/llvm/lib/CodeGen/EarlyIfConversion.cpp index da0987c3b50bb..55caa6e8a8f95 100644 --- a/llvm/lib/CodeGen/EarlyIfConversion.cpp +++ b/llvm/lib/CodeGen/EarlyIfConversion.cpp @@ -134,7 +134,7 @@ class SSAIfConv { BitVector ClobberedRegUnits; // Scratch pad for findInsertionPoint. - SparseSet LiveRegUnits; + SparseSet LiveRegUnits; /// Insertion point in Head for speculatively executed instructions form TBB /// and FBB. @@ -421,7 +421,7 @@ bool SSAIfConv::findInsertionPoint() { if (!LiveRegUnits.empty()) { LLVM_DEBUG({ dbgs() << "Would clobber"; - for (unsigned LRU : LiveRegUnits) + for (MCRegUnit LRU : LiveRegUnits) dbgs() << ' ' << printRegUnit(LRU, TRI); dbgs() << " live before " << *I; }); diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp index d2f2c3ef33c9c..27c5addffa4ab 100644 --- a/llvm/lib/CodeGen/LiveIntervals.cpp +++ b/llvm/lib/CodeGen/LiveIntervals.cpp @@ -305,7 +305,7 @@ void LiveIntervals::computeRegMasks() { /// Compute the live range of a register unit, based on the uses and defs of /// aliasing registers. The range should be empty, or contain only dead /// phi-defs from ABI blocks. -void LiveIntervals::computeRegUnitRange(LiveRange &LR, unsigned Unit) { +void LiveIntervals::computeRegUnitRange(LiveRange &LR, MCRegUnit Unit) { assert(LICalc && "LICalc not initialized."); LICalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator()); @@ -354,7 +354,7 @@ void LiveIntervals::computeLiveInRegUnits() { LLVM_DEBUG(dbgs() << "Computing live-in reg-units in ABI blocks.\n"); // Keep track of the live range sets allocated. 
- SmallVector NewRanges; + SmallVector NewRanges; // Check all basic blocks for live-ins. for (const MachineBasicBlock &MBB : *MF) { @@ -383,7 +383,7 @@ void LiveIntervals::computeLiveInRegUnits() { LLVM_DEBUG(dbgs() << "Created " << NewRanges.size() << " new intervals.\n"); // Compute the 'normal' part of the ranges. - for (unsigned Unit : NewRanges) + for (MCRegUnit Unit : NewRanges) computeRegUnitRange(*RegUnitRanges[Unit], Unit); } @@ -1042,7 +1042,7 @@ class LiveIntervals::HMEditor { // physregs, even those that aren't needed for regalloc, in order to update // kill flags. This is wasteful. Eventually, LiveVariables will strip all kill // flags, and postRA passes will use a live register utility instead. - LiveRange *getRegUnitLI(unsigned Unit) { + LiveRange *getRegUnitLI(MCRegUnit Unit) { if (UpdateFlags && !MRI.isReservedRegUnit(Unit)) return &LIS.getRegUnit(Unit); return LIS.getCachedRegUnit(Unit); diff --git a/llvm/lib/CodeGen/LiveRegMatrix.cpp b/llvm/lib/CodeGen/LiveRegMatrix.cpp index cfda262aac82d..e3ee8dc325933 100644 --- a/llvm/lib/CodeGen/LiveRegMatrix.cpp +++ b/llvm/lib/CodeGen/LiveRegMatrix.cpp @@ -89,7 +89,7 @@ static bool foreachUnit(const TargetRegisterInfo *TRI, Callable Func) { if (VRegInterval.hasSubRanges()) { for (MCRegUnitMaskIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { - unsigned Unit = (*Units).first; + MCRegUnit Unit = (*Units).first; LaneBitmask Mask = (*Units).second; for (const LiveInterval::SubRange &S : VRegInterval.subranges()) { if ((S.LaneMask & Mask).any()) { @@ -115,7 +115,7 @@ void LiveRegMatrix::assign(const LiveInterval &VirtReg, MCRegister PhysReg) { VRM->assignVirt2Phys(VirtReg.reg(), PhysReg); foreachUnit( - TRI, VirtReg, PhysReg, [&](unsigned Unit, const LiveRange &Range) { + TRI, VirtReg, PhysReg, [&](MCRegUnit Unit, const LiveRange &Range) { LLVM_DEBUG(dbgs() << ' ' << printRegUnit(Unit, TRI) << ' ' << Range); Matrix[Unit].unify(VirtReg, Range); return false; @@ -132,7 +132,7 @@ void LiveRegMatrix::unassign(const LiveInterval &VirtReg) { VRM->clearVirt(VirtReg.reg()); foreachUnit(TRI, VirtReg, PhysReg, - [&](unsigned Unit, const LiveRange &Range) { + [&](MCRegUnit Unit, const LiveRange &Range) { LLVM_DEBUG(dbgs() << ' ' << printRegUnit(Unit, TRI)); Matrix[Unit].extract(VirtReg, Range); return false; @@ -175,11 +175,11 @@ bool LiveRegMatrix::checkRegUnitInterference(const LiveInterval &VirtReg, return false; CoalescerPair CP(VirtReg.reg(), PhysReg, *TRI); - bool Result = foreachUnit(TRI, VirtReg, PhysReg, [&](unsigned Unit, - const LiveRange &Range) { - const LiveRange &UnitRange = LIS->getRegUnit(Unit); - return Range.overlaps(UnitRange, CP, *LIS->getSlotIndexes()); - }); + bool Result = foreachUnit( + TRI, VirtReg, PhysReg, [&](MCRegUnit Unit, const LiveRange &Range) { + const LiveRange &UnitRange = LIS->getRegUnit(Unit); + return Range.overlaps(UnitRange, CP, *LIS->getSlotIndexes()); + }); return Result; } diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index 187bff78f236f..5ec7c48d7ee64 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -137,7 +137,7 @@ class CopyTracker { PreservedRegUnits.resize(TRI.getNumRegUnits()); for (unsigned SafeReg = 0, E = TRI.getNumRegs(); SafeReg < E; ++SafeReg) if (!RegMaskOp.clobbersPhysReg(SafeReg)) - for (auto SafeUnit : TRI.regunits(SafeReg)) + for (MCRegUnit SafeUnit : TRI.regunits(SafeReg)) PreservedRegUnits.set(SafeUnit); return PreservedRegUnits; @@ -995,7 +995,7 @@ void 
MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) { // Invalidate all entries in the copy map which are not preserved by // this register mask. bool MIRefedinCopyInfo = false; - for (unsigned RegUnit : TRI->regunits(Reg)) { + for (MCRegUnit RegUnit : TRI->regunits(Reg)) { if (!PreservedRegUnits.test(RegUnit)) Tracker.clobberRegUnit(RegUnit, *TRI, *TII, UseCopyInstr); else { diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp index ae284f3ae2929..094315b3903ea 100644 --- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp +++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp @@ -665,7 +665,7 @@ void MachineRegisterInfo::setCalleeSavedRegs(ArrayRef CSRs) { IsUpdatedCSRsInitialized = true; } -bool MachineRegisterInfo::isReservedRegUnit(unsigned Unit) const { +bool MachineRegisterInfo::isReservedRegUnit(MCRegUnit Unit) const { const TargetRegisterInfo *TRI = getTargetRegisterInfo(); for (MCRegUnitRootIterator Root(Unit, TRI); Root.isValid(); ++Root) { if (all_of(TRI->superregs_inclusive(*Root), diff --git a/llvm/lib/CodeGen/RDFRegisters.cpp b/llvm/lib/CodeGen/RDFRegisters.cpp index b8d54cadc07f6..1400699a607ff 100644 --- a/llvm/lib/CodeGen/RDFRegisters.cpp +++ b/llvm/lib/CodeGen/RDFRegisters.cpp @@ -58,7 +58,7 @@ PhysicalRegisterInfo::PhysicalRegisterInfo(const TargetRegisterInfo &tri, UnitInfos[U].Reg = F; } else { for (MCRegUnitMaskIterator I(F, &TRI); I.isValid(); ++I) { - std::pair P = *I; + std::pair P = *I; UnitInfo &UI = UnitInfos[P.first]; UI.Reg = F; UI.Mask = P.second; @@ -281,9 +281,9 @@ bool RegisterAggr::hasAliasOf(RegisterRef RR) const { return Units.anyCommon(PRI.getMaskUnits(RR.Reg)); for (MCRegUnitMaskIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U) { - std::pair P = *U; - if ((P.second & RR.Mask).any()) - if (Units.test(P.first)) + auto [Unit, LaneMask] = *U; + if ((LaneMask & RR.Mask).any()) + if (Units.test(Unit)) return true; } return false; @@ -296,9 +296,9 @@ bool RegisterAggr::hasCoverOf(RegisterRef RR) const { } for (MCRegUnitMaskIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U) { - std::pair P = *U; - if ((P.second & RR.Mask).any()) - if (!Units.test(P.first)) + auto [Unit, LaneMask] = *U; + if ((LaneMask & RR.Mask).any()) + if (!Units.test(Unit)) return false; } return true; @@ -311,9 +311,9 @@ RegisterAggr &RegisterAggr::insert(RegisterRef RR) { } for (MCRegUnitMaskIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U) { - std::pair P = *U; - if ((P.second & RR.Mask).any()) - Units.set(P.first); + auto [Unit, LaneMask] = *U; + if ((LaneMask & RR.Mask).any()) + Units.set(Unit); } return *this; } @@ -384,9 +384,9 @@ RegisterRef RegisterAggr::makeRegRef() const { LaneBitmask M; for (MCRegUnitMaskIterator I(F, &PRI.getTRI()); I.isValid(); ++I) { - std::pair P = *I; - if (Units.test(P.first)) - M |= P.second; + auto [Unit, LaneMask] = *I; + if (Units.test(Unit)) + M |= LaneMask; } return RegisterRef(F, M); } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index f5a54497c8a98..78d8ea0676dd7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -1268,10 +1268,12 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl &Results) { return; break; - + case ISD::FSINCOS: case ISD::FSINCOSPI: { EVT VT = Node->getValueType(0); - RTLIB::Libcall LC = RTLIB::getSINCOSPI(VT); + RTLIB::Libcall LC = Node->getOpcode() == ISD::FSINCOS + ? 
RTLIB::getSINCOS(VT) + : RTLIB::getSINCOSPI(VT); if (LC != RTLIB::UNKNOWN_LIBCALL && DAG.expandMultipleResultFPLibCall(LC, Node, Results, VT)) return; @@ -1280,14 +1282,6 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl &Results) { // scalarizing. break; } - case ISD::FSINCOS: { - // FIXME: Try to directly match vector case like fsincospi - EVT VT = Node->getValueType(0).getVectorElementType(); - RTLIB::Libcall LC = RTLIB::getSINCOS(VT); - if (DAG.expandMultipleResultFPLibCall(LC, Node, Results, VT)) - return; - break; - } case ISD::FMODF: { EVT VT = Node->getValueType(0).getVectorElementType(); RTLIB::Libcall LC = RTLIB::getMODF(VT); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 602b589633052..efd55af767746 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -9449,7 +9449,9 @@ bool SelectionDAGBuilder::visitStrNLenCall(const CallInst &I) { bool SelectionDAGBuilder::visitUnaryFloatCall(const CallInst &I, unsigned Opcode) { // We already checked this call's prototype; verify it doesn't modify errno. - if (!I.onlyReadsMemory()) + // Do not perform optimizations for call sites that require strict + // floating-point semantics. + if (!I.onlyReadsMemory() || I.isStrictFP()) return false; SDNodeFlags Flags; @@ -9469,7 +9471,9 @@ bool SelectionDAGBuilder::visitUnaryFloatCall(const CallInst &I, bool SelectionDAGBuilder::visitBinaryFloatCall(const CallInst &I, unsigned Opcode) { // We already checked this call's prototype; verify it doesn't modify errno. - if (!I.onlyReadsMemory()) + // Do not perform optimizations for call sites that require strict + // floating-point semantics. + if (!I.onlyReadsMemory() || I.isStrictFP()) return false; SDNodeFlags Flags; @@ -9502,11 +9506,10 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) { // Check for well-known libc/libm calls. If the function is internal, it // can't be a library call. Don't do the check if marked as nobuiltin for - // some reason or the call site requires strict floating point semantics. + // some reason. 
LibFunc Func; - if (!I.isNoBuiltin() && !I.isStrictFP() && !F->hasLocalLinkage() && - F->hasName() && LibInfo->getLibFunc(*F, Func) && - LibInfo->hasOptimizedCodeGen(Func)) { + if (!I.isNoBuiltin() && !F->hasLocalLinkage() && F->hasName() && + LibInfo->getLibFunc(*F, Func) && LibInfo->hasOptimizedCodeGen(Func)) { switch (Func) { default: break; case LibFunc_bcmp: diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 01216552ed260..36a424f1c8b63 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -424,6 +424,24 @@ RTLIB::Libcall RTLIB::getCOS(EVT RetVT) { } RTLIB::Libcall RTLIB::getSINCOS(EVT RetVT) { + // TODO: Tablegen should generate this function + if (RetVT.isVector()) { + if (!RetVT.isSimple()) + return RTLIB::UNKNOWN_LIBCALL; + switch (RetVT.getSimpleVT().SimpleTy) { + case MVT::v4f32: + return RTLIB::SINCOS_V4F32; + case MVT::v2f64: + return RTLIB::SINCOS_V2F64; + case MVT::nxv4f32: + return RTLIB::SINCOS_NXV4F32; + case MVT::nxv2f64: + return RTLIB::SINCOS_NXV2F64; + default: + return RTLIB::UNKNOWN_LIBCALL; + } + } + return getFPLibCall(RetVT, SINCOS_F32, SINCOS_F64, SINCOS_F80, SINCOS_F128, SINCOS_PPCF128); } diff --git a/llvm/lib/CodeGen/TargetOptionsImpl.cpp b/llvm/lib/CodeGen/TargetOptionsImpl.cpp index c33bf8b014b55..16d86b42db4a3 100644 --- a/llvm/lib/CodeGen/TargetOptionsImpl.cpp +++ b/llvm/lib/CodeGen/TargetOptionsImpl.cpp @@ -30,7 +30,7 @@ bool TargetOptions::DisableFramePointerElim(const MachineFunction &MF) const { StringRef FP = FPAttr.getValueAsString(); if (FP == "all") return true; - if (FP == "non-leaf") + if (FP == "non-leaf" || FP == "non-leaf-no-reserve") return MF.getFrameInfo().hasCalls(); if (FP == "none" || FP == "reserved") return false; @@ -45,6 +45,7 @@ bool TargetOptions::FramePointerIsReserved(const MachineFunction &MF) const { return StringSwitch(FPAttr.getValueAsString()) .Cases({"all", "non-leaf", "reserved"}, true) + .Case(("non-leaf-no-reserve"), MF.getFrameInfo().hasCalls()) .Case("none", false); } diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp index 971f822fa6c53..a5c81afc57a80 100644 --- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp +++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp @@ -133,7 +133,7 @@ Printable llvm::printReg(Register Reg, const TargetRegisterInfo *TRI, }); } -Printable llvm::printRegUnit(unsigned Unit, const TargetRegisterInfo *TRI) { +Printable llvm::printRegUnit(MCRegUnit Unit, const TargetRegisterInfo *TRI) { return Printable([Unit, TRI](raw_ostream &OS) { // Generic printout when TRI is missing. 
if (!TRI) { diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp index fc067459dcba3..31a294447152e 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -396,6 +396,9 @@ Function *Function::createWithDefaultAttr(FunctionType *Ty, case FramePointerKind::NonLeaf: B.addAttribute("frame-pointer", "non-leaf"); break; + case FramePointerKind::NonLeafNoReserve: + B.addAttribute("frame-pointer", "non-leaf-no-reserve"); + break; case FramePointerKind::All: B.addAttribute("frame-pointer", "all"); break; diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 6031659e4002e..b953016e8d7ab 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -2512,7 +2512,8 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs, if (Attribute FPAttr = Attrs.getFnAttr("frame-pointer"); FPAttr.isValid()) { StringRef FP = FPAttr.getValueAsString(); - if (FP != "all" && FP != "non-leaf" && FP != "none" && FP != "reserved") + if (FP != "all" && FP != "non-leaf" && FP != "none" && FP != "reserved" && + FP != "non-leaf-no-reserve") CheckFailed("invalid value for 'frame-pointer' attribute: " + FP, V); } @@ -2582,6 +2583,20 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs, CheckFailed("invalid value for 'denormal-fp-math-f32' attribute: " + S, V); } + + if (auto A = Attrs.getFnAttr("modular-format"); A.isValid()) { + StringRef S = A.getValueAsString(); + SmallVector<StringRef> Args; + S.split(Args, ','); + Check(Args.size() >= 5, + "modular-format attribute requires at least 5 arguments", V); + unsigned FirstArgIdx; + Check(!Args[2].getAsInteger(10, FirstArgIdx), + "modular-format attribute first arg index is not an integer", V); + unsigned UpperBound = FT->getNumParams() + (FT->isVarArg() ? 1 : 0); + Check(FirstArgIdx > 0 && FirstArgIdx <= UpperBound, + "modular-format attribute first arg index is out of bounds", V); + } } void Verifier::verifyUnknownProfileMetadata(MDNode *MD) { Check(MD->getNumOperands() == 2, diff --git a/llvm/lib/MC/MCRegisterInfo.cpp b/llvm/lib/MC/MCRegisterInfo.cpp index ba9ef00f9f0d8..7fd92bf974b95 100644 --- a/llvm/lib/MC/MCRegisterInfo.cpp +++ b/llvm/lib/MC/MCRegisterInfo.cpp @@ -221,7 +221,7 @@ bool MCRegisterInfo::regsOverlap(MCRegister RegA, MCRegister RegB) const { return false; } -bool MCRegisterInfo::isArtificialRegUnit(unsigned Unit) const { +bool MCRegisterInfo::isArtificialRegUnit(MCRegUnit Unit) const { for (MCRegUnitRootIterator Root(Unit, this); Root.isValid(); ++Root) if (isArtificial(*Root)) return true; diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp index a95ccf83a2636..a8535dfa8a5d3 100644 --- a/llvm/lib/MC/MachObjectWriter.cpp +++ b/llvm/lib/MC/MachObjectWriter.cpp @@ -28,6 +28,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/SMLoc.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -584,7 +585,12 @@ void MachObjectWriter::computeSymbolTable( unsigned Index = 1; for (MCSection &Sec : Asm) SectionIndexMap[&Sec] = Index++; - assert(Index <= 256 && "Too many sections!"); + + // Section indices begin from 1 in MachO. Only sections 1-255 can be encoded + // in a symbol's n_sect field. Referencing a section with an index larger + // than 255 will leave n_sect unset for those symbols. + if (Index > 255) + getContext().reportError(SMLoc(), "Too many sections!"); // Build the string table.
for (const MCSymbol &Symbol : Asm.symbols()) { @@ -621,7 +627,8 @@ void MachObjectWriter::computeSymbolTable( ExternalSymbolData.push_back(MSD); } else { MSD.SectionIndex = SectionIndexMap.lookup(&Symbol.getSection()); - assert(MSD.SectionIndex && "Invalid section index!"); + if (!MSD.SectionIndex) + getContext().reportError(SMLoc(), "Invalid section index!"); ExternalSymbolData.push_back(MSD); } } @@ -645,7 +652,8 @@ void MachObjectWriter::computeSymbolTable( LocalSymbolData.push_back(MSD); } else { MSD.SectionIndex = SectionIndexMap.lookup(&Symbol.getSection()); - assert(MSD.SectionIndex && "Invalid section index!"); + if (!MSD.SectionIndex) + getContext().reportError(SMLoc(), "Invalid section index!"); LocalSymbolData.push_back(MSD); } } diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 70c5c29149288..de55704a37531 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -1554,8 +1554,10 @@ static bool produceCompactUnwindFrame(const AArch64FrameLowering &AFL, !AFL.requiresSaveVG(MF) && !AFI->isSVECC(); } -static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2, - bool NeedsWinCFI, bool IsFirst, +static bool invalidateWindowsRegisterPairing(bool SpillExtendedVolatile, + unsigned SpillCount, unsigned Reg1, + unsigned Reg2, bool NeedsWinCFI, + bool IsFirst, const TargetRegisterInfo *TRI) { // If we are generating register pairs for a Windows function that requires // EH support, then pair consecutive registers only. There are no unwind @@ -1568,8 +1570,18 @@ static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2, return true; if (!NeedsWinCFI) return false; + + // ARM64EC introduced `save_any_regp`, which expects 16-byte alignment. + // This is handled by only allowing paired spills for registers spilled at + // even positions (which should be 16-byte aligned, as the other GPR/FPR + // spills are 8 bytes each). We carve out an exception for {FP,LR}, which + // does not require 16-byte alignment in the uop representation. if (TRI->getEncodingValue(Reg2) == TRI->getEncodingValue(Reg1) + 1) - return false; + return SpillExtendedVolatile + ? !((Reg1 == AArch64::FP && Reg2 == AArch64::LR) || + (SpillCount % 2) == 0) + : false; + // If pairing a GPR with LR, the pair can be described by the save_lrpair // opcode. If this is the first register pair, it would end up with a // predecrement, but there's no save_lrpair_x opcode, so we can only do this @@ -1585,12 +1597,15 @@ static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2, /// WindowsCFI requires that only consecutive registers can be paired. /// LR and FP need to be allocated together when the frame needs to save /// the frame-record. This means any other register pairing with LR is invalid.
-static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2, - bool UsesWinAAPCS, bool NeedsWinCFI, - bool NeedsFrameRecord, bool IsFirst, +static bool invalidateRegisterPairing(bool SpillExtendedVolatile, + unsigned SpillCount, unsigned Reg1, + unsigned Reg2, bool UsesWinAAPCS, + bool NeedsWinCFI, bool NeedsFrameRecord, + bool IsFirst, const TargetRegisterInfo *TRI) { if (UsesWinAAPCS) - return invalidateWindowsRegisterPairing(Reg1, Reg2, NeedsWinCFI, IsFirst, + return invalidateWindowsRegisterPairing(SpillExtendedVolatile, SpillCount, + Reg1, Reg2, NeedsWinCFI, IsFirst, TRI); // If we need to store the frame record, don't pair any register @@ -1688,6 +1703,21 @@ void computeCalleeSaveRegisterPairs(const AArch64FrameLowering &AFL, } bool FPAfterSVECalleeSaves = IsWindows && AFI->getSVECalleeSavedStackSize(); + // Windows AAPCS has x9-x15 as volatile registers, x16-x17 as intra-procedural + // scratch, x18 as platform reserved. However, clang has extended calling + // conventions such as preserve_most and preserve_all which treat these as + // CSRs. As such, the ARM64 unwind uOPs bias registers by 19. We use ARM64EC + // uOPs, which have separate restrictions, so we need to check for that. + // + // NOTE: we currently do not account for the D registers as LLVM does not + // support non-ABI compliant D register spills. + bool SpillExtendedVolatile = + IsWindows && std::any_of(std::begin(CSI), std::end(CSI), + [](const CalleeSavedInfo &CSI) { + const auto &Reg = CSI.getReg(); + return Reg >= AArch64::X0 && + Reg <= AArch64::X18; + }); int ZPRByteOffset = 0; int PPRByteOffset = 0; @@ -1749,17 +1779,19 @@ void computeCalleeSaveRegisterPairs(const AArch64FrameLowering &AFL, if (unsigned(i + RegInc) < Count && !HasCSHazardPadding) { MCRegister NextReg = CSI[i + RegInc].getReg(); bool IsFirst = i == FirstReg; + unsigned SpillCount = NeedsWinCFI ? FirstReg - i : i; switch (RPI.Type) { case RegPairInfo::GPR: if (AArch64::GPR64RegClass.contains(NextReg) && - !invalidateRegisterPairing(RPI.Reg1, NextReg, IsWindows, - NeedsWinCFI, NeedsFrameRecord, IsFirst, - TRI)) + !invalidateRegisterPairing( + SpillExtendedVolatile, SpillCount, RPI.Reg1, NextReg, IsWindows, + NeedsWinCFI, NeedsFrameRecord, IsFirst, TRI)) RPI.Reg2 = NextReg; break; case RegPairInfo::FPR64: if (AArch64::FPR64RegClass.contains(NextReg) && - !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI, + !invalidateWindowsRegisterPairing(SpillExtendedVolatile, SpillCount, + RPI.Reg1, NextReg, NeedsWinCFI, IsFirst, TRI)) RPI.Reg2 = NextReg; break; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 8457f6178fdc2..3c6679f3c1fa7 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -5555,9 +5555,10 @@ SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op, SDLoc DL(Op); SDValue Chain = Op.getOperand(0); - SDValue FPCR_64 = DAG.getNode( - ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, - {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)}); + SDValue FPCR_64 = + DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, + {Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, + MVT::i64)}); Chain = FPCR_64.getValue(1); SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR_64); SDValue FltRounds = DAG.getNode(ISD::ADD, DL, MVT::i32, FPCR_32, @@ -5643,7 +5644,8 @@ SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op, // Set new value of FPCR.
SDValue Ops2[] = { - Chain, DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), FPCR}; + Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), + FPCR}; return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2); } @@ -5666,9 +5668,9 @@ SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op, DAG.getConstant(AArch64::ReservedFPControlBits, DL, MVT::i64)); // Set new value of FPCR. - SDValue Ops2[] = {Chain, - DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), - FPSCRMasked}; + SDValue Ops2[] = { + Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), + FPSCRMasked}; return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2); } @@ -7300,17 +7302,19 @@ SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op, SDValue Compressed = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, Vec.getValueType(), - DAG.getConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64), Mask, Vec); + DAG.getTargetConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64), Mask, + Vec); // compact fills with 0s, so if our passthru is all 0s, do nothing here. if (HasPassthru && !ISD::isConstantSplatVectorAllZeros(Passthru.getNode())) { SDValue Offset = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, - DAG.getConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask, Mask); + DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask, + Mask); SDValue IndexMask = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, MaskVT, - DAG.getConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64), + DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64), DAG.getConstant(0, DL, MVT::i64), Offset); Compressed = @@ -7439,10 +7443,10 @@ static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) { DAG.getUNDEF(ExpVT), Exp, Zero); SDValue VPg = getPTrue(DAG, DL, XVT.changeVectorElementType(MVT::i1), AArch64SVEPredPattern::all); - SDValue FScale = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, XVT, - DAG.getConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64), - VPg, VX, VExp); + SDValue FScale = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, XVT, + DAG.getTargetConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64), VPg, + VX, VExp); SDValue Final = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero); if (X.getValueType() != XScalarTy) @@ -8106,7 +8110,7 @@ static SDValue emitRestoreZALazySave(SDValue Chain, SDLoc DL, TLI.getLibcallName(LC), TLI.getPointerTy(DAG.getDataLayout())); SDValue TPIDR2_EL0 = DAG.getNode( ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Chain, - DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32)); + DAG.getTargetConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32)); // Copy the address of the TPIDR2 block into X0 before 'calling' the // RESTORE_ZA pseudo. SDValue Glue; @@ -8121,7 +8125,7 @@ static SDValue emitRestoreZALazySave(SDValue Chain, SDLoc DL, // Finally reset the TPIDR2_EL0 register to 0. 
Chain = DAG.getNode( ISD::INTRINSIC_VOID, DL, MVT::Other, Chain, - DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), + DAG.getTargetConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), DAG.getConstant(0, DL, MVT::i64)); TPIDR2.Uses++; return Chain; @@ -8716,7 +8720,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( if (Attrs.isNewZT0()) Chain = DAG.getNode( ISD::INTRINSIC_VOID, DL, MVT::Other, Chain, - DAG.getConstant(Intrinsic::aarch64_sme_zero_zt, DL, MVT::i32), + DAG.getTargetConstant(Intrinsic::aarch64_sme_zero_zt, DL, MVT::i32), DAG.getTargetConstant(0, DL, MVT::i32)); } @@ -9529,7 +9533,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); Chain = DAG.getNode( ISD::INTRINSIC_VOID, DL, MVT::Other, Chain, - DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), + DAG.getTargetConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), TPIDR2ObjAddr); OptimizationRemarkEmitter ORE(&MF.getFunction()); ORE.emit([&]() { @@ -13421,8 +13425,8 @@ SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG) { return DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), SourceVec, - MaskSourceVec); + DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), + SourceVec, MaskSourceVec); } // Gather data to see if the operation can be modelled as a @@ -14278,14 +14282,16 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef ShuffleMask, V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, - DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, + DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), + V1Cst, DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen))); } else { if (IndexLen == 8) { V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, - DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, + DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), + V1Cst, DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen))); } else { // FIXME: We cannot, for the moment, emit a TBL2 instruction because we @@ -14296,8 +14302,8 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef ShuffleMask, // IndexLen)); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, - DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst, - V2Cst, + DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), + V1Cst, V2Cst, DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen))); } } @@ -16450,10 +16456,10 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0), DAG.getTargetConstant(Cnt, DL, MVT::i32)); - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL, - MVT::i32), - Op.getOperand(0), Op.getOperand(1)); + return DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getTargetConstant(Intrinsic::aarch64_neon_ushl, DL, MVT::i32), + Op.getOperand(0), Op.getOperand(1)); case ISD::SRA: case ISD::SRL: if (VT.isScalableVector() && @@ -20049,7 +20055,7 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, : Intrinsic::aarch64_neon_vcvtfp2fxu; SDValue FixConv = 
DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy, - DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), + DAG.getTargetConstant(IntrinsicOpcode, DL, MVT::i32), Op->getOperand(0), DAG.getTargetConstant(C, DL, MVT::i32)); // We can handle smaller integers by generating an extra trunc. if (IntBits < FloatBits) @@ -27338,8 +27344,8 @@ static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG, // ...and remap the intrinsic `aarch64_sve_prf_gather_scalar_offset` to // `aarch64_sve_prfb_gather_uxtw_index`. SDLoc DL(N); - Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL, - MVT::i64); + Ops[1] = DAG.getTargetConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, + DL, MVT::i64); return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops); } @@ -31204,10 +31210,10 @@ static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2, SDValue Shuffle; if (IsSingleOp) - Shuffle = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT, - DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32), - Op1, SVEMask); + Shuffle = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT, + DAG.getTargetConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32), Op1, + SVEMask); else if (Subtarget.hasSVE2()) { if (!MinMaxEqual) { unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt; @@ -31226,10 +31232,10 @@ static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2, SVEMask = convertToScalableVector( DAG, getContainerForFixedLengthVector(DAG, MaskType), UpdatedVecMask); } - Shuffle = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT, - DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32), - Op1, Op2, SVEMask); + Shuffle = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT, + DAG.getTargetConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32), Op1, + Op2, SVEMask); } Shuffle = convertFromScalableVector(DAG, VT, Shuffle); return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); @@ -31389,8 +31395,8 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE( unsigned SegmentElts = VT.getVectorNumElements() / Segments; if (std::optional Lane = isDUPQMask(ShuffleMask, Segments, SegmentElts)) { - SDValue IID = - DAG.getConstant(Intrinsic::aarch64_sve_dup_laneq, DL, MVT::i64); + SDValue IID = DAG.getTargetConstant(Intrinsic::aarch64_sve_dup_laneq, + DL, MVT::i64); return convertFromScalableVector( DAG, VT, DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT, diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 6273cfc1005d6..f5dfbdc596510 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -88,7 +88,7 @@ class AArch64AsmParser : public MCTargetAsmParser { StringRef Mnemonic; ///< Instruction mnemonic. // Map of register aliases registers via the .req directive. 
- StringMap> RegisterReqs; + StringMap> RegisterReqs; class PrefixInfo { public: @@ -165,7 +165,7 @@ class AArch64AsmParser : public MCTargetAsmParser { AArch64CC::CondCode parseCondCodeString(StringRef Cond, std::string &Suggestion); bool parseCondCode(OperandVector &Operands, bool invertCondCode); - unsigned matchRegisterNameAlias(StringRef Name, RegKind Kind); + MCRegister matchRegisterNameAlias(StringRef Name, RegKind Kind); bool parseRegister(OperandVector &Operands); bool parseSymbolicImmVal(const MCExpr *&ImmVal); bool parseNeonVectorList(OperandVector &Operands); @@ -391,7 +391,7 @@ class AArch64Operand : public MCParsedAsmOperand { }; struct RegOp { - unsigned RegNum; + MCRegister Reg; RegKind Kind; int ElementWidth; @@ -417,7 +417,7 @@ class AArch64Operand : public MCParsedAsmOperand { }; struct MatrixRegOp { - unsigned RegNum; + MCRegister Reg; unsigned ElementWidth; MatrixKind Kind; }; @@ -427,7 +427,7 @@ class AArch64Operand : public MCParsedAsmOperand { }; struct VectorListOp { - unsigned RegNum; + MCRegister Reg; unsigned Count; unsigned Stride; unsigned NumElements; @@ -688,12 +688,12 @@ class AArch64Operand : public MCParsedAsmOperand { MCRegister getReg() const override { assert(Kind == k_Register && "Invalid access!"); - return Reg.RegNum; + return Reg.Reg; } - unsigned getMatrixReg() const { + MCRegister getMatrixReg() const { assert(Kind == k_MatrixRegister && "Invalid access!"); - return MatrixReg.RegNum; + return MatrixReg.Reg; } unsigned getMatrixElementWidth() const { @@ -716,9 +716,9 @@ class AArch64Operand : public MCParsedAsmOperand { return Reg.EqualityTy; } - unsigned getVectorListStart() const { + MCRegister getVectorListStart() const { assert(Kind == k_VectorList && "Invalid access!"); - return VectorList.RegNum; + return VectorList.Reg; } unsigned getVectorListCount() const { @@ -1264,15 +1264,15 @@ class AArch64Operand : public MCParsedAsmOperand { bool isNeonVectorRegLo() const { return Kind == k_Register && Reg.Kind == RegKind::NeonVector && (AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains( - Reg.RegNum) || + Reg.Reg) || AArch64MCRegisterClasses[AArch64::FPR64_loRegClassID].contains( - Reg.RegNum)); + Reg.Reg)); } bool isNeonVectorReg0to7() const { return Kind == k_Register && Reg.Kind == RegKind::NeonVector && (AArch64MCRegisterClasses[AArch64::FPR128_0to7RegClassID].contains( - Reg.RegNum)); + Reg.Reg)); } bool isMatrix() const { return Kind == k_MatrixRegister; } @@ -1401,34 +1401,34 @@ class AArch64Operand : public MCParsedAsmOperand { bool isGPR32as64() const { return Kind == k_Register && Reg.Kind == RegKind::Scalar && - AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(Reg.RegNum); + AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(Reg.Reg); } bool isGPR64as32() const { return Kind == k_Register && Reg.Kind == RegKind::Scalar && - AArch64MCRegisterClasses[AArch64::GPR32RegClassID].contains(Reg.RegNum); + AArch64MCRegisterClasses[AArch64::GPR32RegClassID].contains(Reg.Reg); } bool isGPR64x8() const { return Kind == k_Register && Reg.Kind == RegKind::Scalar && AArch64MCRegisterClasses[AArch64::GPR64x8ClassRegClassID].contains( - Reg.RegNum); + Reg.Reg); } bool isWSeqPair() const { return Kind == k_Register && Reg.Kind == RegKind::Scalar && AArch64MCRegisterClasses[AArch64::WSeqPairsClassRegClassID].contains( - Reg.RegNum); + Reg.Reg); } bool isXSeqPair() const { return Kind == k_Register && Reg.Kind == RegKind::Scalar && AArch64MCRegisterClasses[AArch64::XSeqPairsClassRegClassID].contains( - Reg.RegNum); + 
Reg.Reg); } bool isSyspXzrPair() const { - return isGPR64() && Reg.RegNum == AArch64::XZR; + return isGPR64() && Reg.Reg == AArch64::XZR; } template @@ -1495,7 +1495,7 @@ class AArch64Operand : public MCParsedAsmOperand { isTypedVectorList(); if (!Res) return DiagnosticPredicate::NoMatch; - if (!AArch64MCRegisterClasses[RegClass].contains(VectorList.RegNum)) + if (!AArch64MCRegisterClasses[RegClass].contains(VectorList.Reg)) return DiagnosticPredicate::NearMatch; return DiagnosticPredicate::Match; } @@ -1507,9 +1507,9 @@ class AArch64Operand : public MCParsedAsmOperand { ElementWidth, Stride>(); if (!Res) return DiagnosticPredicate::NoMatch; - if ((VectorList.RegNum < (AArch64::Z0 + Stride)) || - ((VectorList.RegNum >= AArch64::Z16) && - (VectorList.RegNum < (AArch64::Z16 + Stride)))) + if ((VectorList.Reg < (AArch64::Z0 + Stride)) || + ((VectorList.Reg >= AArch64::Z16) && + (VectorList.Reg < (AArch64::Z16 + Stride)))) return DiagnosticPredicate::Match; return DiagnosticPredicate::NoMatch; } @@ -1841,7 +1841,7 @@ class AArch64Operand : public MCParsedAsmOperand { void addPPRorPNRRegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - unsigned Reg = getReg(); + MCRegister Reg = getReg(); // Normalise to PPR if (Reg >= AArch64::PN0 && Reg <= AArch64::PN15) Reg = Reg - AArch64::PN0 + AArch64::P0; @@ -2336,13 +2336,12 @@ class AArch64Operand : public MCParsedAsmOperand { } static std::unique_ptr - CreateReg(unsigned RegNum, RegKind Kind, SMLoc S, SMLoc E, MCContext &Ctx, + CreateReg(MCRegister Reg, RegKind Kind, SMLoc S, SMLoc E, MCContext &Ctx, RegConstraintEqualityTy EqTy = RegConstraintEqualityTy::EqualsReg, AArch64_AM::ShiftExtendType ExtTy = AArch64_AM::LSL, - unsigned ShiftAmount = 0, - unsigned HasExplicitAmount = false) { + unsigned ShiftAmount = 0, unsigned HasExplicitAmount = false) { auto Op = std::make_unique(k_Register, Ctx); - Op->Reg.RegNum = RegNum; + Op->Reg.Reg = Reg; Op->Reg.Kind = Kind; Op->Reg.ElementWidth = 0; Op->Reg.EqualityTy = EqTy; @@ -2354,28 +2353,26 @@ class AArch64Operand : public MCParsedAsmOperand { return Op; } - static std::unique_ptr - CreateVectorReg(unsigned RegNum, RegKind Kind, unsigned ElementWidth, - SMLoc S, SMLoc E, MCContext &Ctx, - AArch64_AM::ShiftExtendType ExtTy = AArch64_AM::LSL, - unsigned ShiftAmount = 0, - unsigned HasExplicitAmount = false) { + static std::unique_ptr CreateVectorReg( + MCRegister Reg, RegKind Kind, unsigned ElementWidth, SMLoc S, SMLoc E, + MCContext &Ctx, AArch64_AM::ShiftExtendType ExtTy = AArch64_AM::LSL, + unsigned ShiftAmount = 0, unsigned HasExplicitAmount = false) { assert((Kind == RegKind::NeonVector || Kind == RegKind::SVEDataVector || Kind == RegKind::SVEPredicateVector || Kind == RegKind::SVEPredicateAsCounter) && "Invalid vector kind"); - auto Op = CreateReg(RegNum, Kind, S, E, Ctx, EqualsReg, ExtTy, ShiftAmount, + auto Op = CreateReg(Reg, Kind, S, E, Ctx, EqualsReg, ExtTy, ShiftAmount, HasExplicitAmount); Op->Reg.ElementWidth = ElementWidth; return Op; } static std::unique_ptr - CreateVectorList(unsigned RegNum, unsigned Count, unsigned Stride, + CreateVectorList(MCRegister Reg, unsigned Count, unsigned Stride, unsigned NumElements, unsigned ElementWidth, RegKind RegisterKind, SMLoc S, SMLoc E, MCContext &Ctx) { auto Op = std::make_unique(k_VectorList, Ctx); - Op->VectorList.RegNum = RegNum; + Op->VectorList.Reg = Reg; Op->VectorList.Count = Count; Op->VectorList.Stride = Stride; Op->VectorList.NumElements = NumElements; @@ -2586,10 +2583,10 @@ class AArch64Operand : 
public MCParsedAsmOperand { } static std::unique_ptr - CreateMatrixRegister(unsigned RegNum, unsigned ElementWidth, MatrixKind Kind, + CreateMatrixRegister(MCRegister Reg, unsigned ElementWidth, MatrixKind Kind, SMLoc S, SMLoc E, MCContext &Ctx) { auto Op = std::make_unique(k_MatrixRegister, Ctx); - Op->MatrixReg.RegNum = RegNum; + Op->MatrixReg.Reg = Reg; Op->MatrixReg.ElementWidth = ElementWidth; Op->MatrixReg.Kind = Kind; Op->StartLoc = S; @@ -2660,9 +2657,9 @@ void AArch64Operand::print(raw_ostream &OS, const MCAsmInfo &MAI) const { break; case k_VectorList: { OS << ""; break; } @@ -2699,7 +2696,7 @@ void AArch64Operand::print(raw_ostream &OS, const MCAsmInfo &MAI) const { OS << getCMHPriorityHintName(); break; case k_MatrixRegister: - OS << ""; + OS << ""; break; case k_MatrixTileList: { OS << ""; + OS << ""; if (!getShiftExtendAmount() && !hasShiftExtendAmount()) break; [[fallthrough]]; @@ -3048,53 +3045,53 @@ ParseStatus AArch64AsmParser::tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, } // Matches a register name or register alias previously defined by '.req' -unsigned AArch64AsmParser::matchRegisterNameAlias(StringRef Name, - RegKind Kind) { - unsigned RegNum = 0; - if ((RegNum = matchSVEDataVectorRegName(Name))) - return Kind == RegKind::SVEDataVector ? RegNum : 0; +MCRegister AArch64AsmParser::matchRegisterNameAlias(StringRef Name, + RegKind Kind) { + MCRegister Reg = MCRegister(); + if ((Reg = matchSVEDataVectorRegName(Name))) + return Kind == RegKind::SVEDataVector ? Reg : MCRegister(); - if ((RegNum = matchSVEPredicateVectorRegName(Name))) - return Kind == RegKind::SVEPredicateVector ? RegNum : 0; + if ((Reg = matchSVEPredicateVectorRegName(Name))) + return Kind == RegKind::SVEPredicateVector ? Reg : MCRegister(); - if ((RegNum = matchSVEPredicateAsCounterRegName(Name))) - return Kind == RegKind::SVEPredicateAsCounter ? RegNum : 0; + if ((Reg = matchSVEPredicateAsCounterRegName(Name))) + return Kind == RegKind::SVEPredicateAsCounter ? Reg : MCRegister(); - if ((RegNum = MatchNeonVectorRegName(Name))) - return Kind == RegKind::NeonVector ? RegNum : 0; + if ((Reg = MatchNeonVectorRegName(Name))) + return Kind == RegKind::NeonVector ? Reg : MCRegister(); - if ((RegNum = matchMatrixRegName(Name))) - return Kind == RegKind::Matrix ? RegNum : 0; + if ((Reg = matchMatrixRegName(Name))) + return Kind == RegKind::Matrix ? Reg : MCRegister(); - if (Name.equals_insensitive("zt0")) + if (Name.equals_insensitive("zt0")) return Kind == RegKind::LookupTable ? unsigned(AArch64::ZT0) : 0; // The parsed register must be of RegKind Scalar - if ((RegNum = MatchRegisterName(Name))) - return (Kind == RegKind::Scalar) ? RegNum : 0; + if ((Reg = MatchRegisterName(Name))) + return (Kind == RegKind::Scalar) ? Reg : MCRegister(); - if (!RegNum) { + if (!Reg) { // Handle a few common aliases of registers. - if (auto RegNum = StringSwitch(Name.lower()) - .Case("fp", AArch64::FP) - .Case("lr", AArch64::LR) - .Case("x31", AArch64::XZR) - .Case("w31", AArch64::WZR) - .Default(0)) - return Kind == RegKind::Scalar ? RegNum : 0; + if (MCRegister Reg = StringSwitch(Name.lower()) + .Case("fp", AArch64::FP) + .Case("lr", AArch64::LR) + .Case("x31", AArch64::XZR) + .Case("w31", AArch64::WZR) + .Default(0)) + return Kind == RegKind::Scalar ? Reg : MCRegister(); // Check for aliases registered via .req. Canonicalize to lower case. // That's more consistent since register names are case insensitive, and // it's how the original entry was passed in from MC/MCParser/AsmParser. 
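    // Illustration (hypothetical alias): after `.req foo, x4` the map holds
    // {"foo" -> (RegKind::Scalar, AArch64::X4)}, so both `foo` and `FOO` in a
    // later operand resolve to X4 through the lower-cased lookup below.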
auto Entry = RegisterReqs.find(Name.lower()); if (Entry == RegisterReqs.end()) - return 0; + return MCRegister(); - // set RegNum if the match is the right kind of register + // set Reg if the match is the right kind of register if (Kind == Entry->getValue().first) - RegNum = Entry->getValue().second; + Reg = Entry->getValue().second; } - return RegNum; + return Reg; } unsigned AArch64AsmParser::getNumRegsForRegKind(RegKind K) { @@ -3122,8 +3119,8 @@ ParseStatus AArch64AsmParser::tryParseScalarRegister(MCRegister &RegNum) { return ParseStatus::NoMatch; std::string lowerCase = Tok.getString().lower(); - unsigned Reg = matchRegisterNameAlias(lowerCase, RegKind::Scalar); - if (Reg == 0) + MCRegister Reg = matchRegisterNameAlias(lowerCase, RegKind::Scalar); + if (!Reg) return ParseStatus::NoMatch; RegNum = Reg; @@ -3667,7 +3664,7 @@ ParseStatus AArch64AsmParser::tryParseMatrixRegister(OperandVector &Operands) { } // Try to parse matrix register. - unsigned Reg = matchRegisterNameAlias(Name, RegKind::Matrix); + MCRegister Reg = matchRegisterNameAlias(Name, RegKind::Matrix); if (!Reg) return ParseStatus::NoMatch; @@ -4130,12 +4127,12 @@ bool AArch64AsmParser::parseSyslAlias(StringRef Name, SMLoc NameLoc, SMLoc startLoc = getLoc(); const AsmToken ®Tok = getTok(); StringRef reg = regTok.getString(); - unsigned RegNum = matchRegisterNameAlias(reg.lower(), RegKind::Scalar); - if (!RegNum) + MCRegister Reg = matchRegisterNameAlias(reg.lower(), RegKind::Scalar); + if (!Reg) return TokError("expected register operand"); Operands.push_back(AArch64Operand::CreateReg( - RegNum, RegKind::Scalar, startLoc, getLoc(), getContext(), EqualsReg)); + Reg, RegKind::Scalar, startLoc, getLoc(), getContext(), EqualsReg)); Lex(); // Eat token if (parseToken(AsmToken::Comma)) @@ -4453,7 +4450,7 @@ ParseStatus AArch64AsmParser::tryParseVectorRegister(MCRegister &Reg, // a '.'. size_t Start = 0, Next = Name.find('.'); StringRef Head = Name.slice(Start, Next); - unsigned RegNum = matchRegisterNameAlias(Head, MatchKind); + MCRegister RegNum = matchRegisterNameAlias(Head, MatchKind); if (RegNum) { if (Next != StringRef::npos) { @@ -4937,13 +4934,13 @@ ParseStatus AArch64AsmParser::tryParseZTOperand(OperandVector &Operands) { const AsmToken &Tok = getTok(); std::string Name = Tok.getString().lower(); - unsigned RegNum = matchRegisterNameAlias(Name, RegKind::LookupTable); + MCRegister Reg = matchRegisterNameAlias(Name, RegKind::LookupTable); - if (RegNum == 0) + if (!Reg) return ParseStatus::NoMatch; Operands.push_back(AArch64Operand::CreateReg( - RegNum, RegKind::LookupTable, StartLoc, getLoc(), getContext())); + Reg, RegKind::LookupTable, StartLoc, getLoc(), getContext())); Lex(); // Eat register. 
// Check if register is followed by an index @@ -7651,7 +7648,7 @@ bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) { if (parseEOL()) return true; - auto pair = std::make_pair(RegisterKind, (unsigned) RegNum); + auto pair = std::make_pair(RegisterKind, RegNum); if (RegisterReqs.insert(std::make_pair(Name, pair)).first->second != pair) Warning(L, "ignoring redefinition of register alias '" + Name + "'"); diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 0b61adf409948..b008354cfd462 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -2976,15 +2976,46 @@ def HasSetPrioIncWgInst : Predicate<"Subtarget->hasSetPrioIncWgInst()">, def NeedsAlignedVGPRs : Predicate<"Subtarget->needsAlignedVGPRs()">, AssemblerPredicate<(all_of FeatureRequiresAlignedVGPRs)>; +def NotNeedsAlignedVGPRs : Predicate<"!Subtarget->needsAlignedVGPRs()">, + AssemblerPredicate<(all_of (not FeatureRequiresAlignedVGPRs))>; + +def isWave32 : Predicate<"Subtarget->isWave32()">, + AssemblerPredicate <(any_of FeatureWavefrontSize32, + FeatureAssemblerPermissiveWavesize)>; +def isWave64 : Predicate<"Subtarget->isWave64()">, + AssemblerPredicate <(any_of FeatureWavefrontSize64, + FeatureAssemblerPermissiveWavesize)>; + +def isWave32Strict : Predicate<"Subtarget->isWave32()">, + AssemblerPredicate <(all_of FeatureWavefrontSize32)>; +def isWave64Strict : Predicate<"Subtarget->isWave64()">, + AssemblerPredicate <(all_of FeatureWavefrontSize64)>; + //===----------------------------------------------------------------------===// // HwModes //===----------------------------------------------------------------------===// -// gfx90a-gfx950. Has AGPRs, and also the align2 VGPR/AGPR requirement +defvar DefaultMode_Wave64 = DefaultMode; +defvar DefaultMode_Wave32 = HwMode<[isWave32, NotNeedsAlignedVGPRs]>; + +// gfx90a-gfx950. Has AGPRs, and also the align2 VGPR/AGPR requirement. Wave64 +// is implied. def AVAlign2LoadStoreMode : HwMode<[HasMAIInsts, NeedsAlignedVGPRs]>; // gfx1250, has alignment requirement but no AGPRs. -def AlignedVGPRNoAGPRMode : HwMode<[NotHasMAIInsts, NeedsAlignedVGPRs]>; +def AlignedVGPRNoAGPRMode_Wave32 : HwMode<[NotHasMAIInsts, NeedsAlignedVGPRs, isWave32Strict]>; +def AlignedVGPRNoAGPRMode_Wave64 : HwMode<[NotHasMAIInsts, NeedsAlignedVGPRs, isWave64Strict]>; + +// FIXME: It should be possible to define only a separate hwmode that +// depends solely on wavesize, used just for the ValueTypes. These use different +// HwMode namespaces.
If we don't define the full set of modes used +// for RegClassByHwMode, tablegen crashes for some reason +def WaveSizeVT : ValueTypeByHwMode<[ + DefaultMode_Wave64, + AVAlign2LoadStoreMode, + AlignedVGPRNoAGPRMode_Wave64, + DefaultMode_Wave32, + AlignedVGPRNoAGPRMode_Wave32], [i64, i64, i64, i32, i32]>; // Include AMDGPU TD files diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index dc8fa7f0eef49..1765d054a3c0d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -873,6 +873,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { case Sgpr128: case Vgpr128: return LLT::scalar(128); + case SgprP0: case VgprP0: return LLT::pointer(0, 64); case SgprP1: @@ -887,6 +888,8 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { case SgprP5: case VgprP5: return LLT::pointer(5, 32); + case SgprP8: + return LLT::pointer(8, 128); case SgprV2S16: case VgprV2S16: case UniInVgprV2S16: @@ -972,10 +975,12 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case Sgpr32_WF: case Sgpr64: case Sgpr128: + case SgprP0: case SgprP1: case SgprP3: case SgprP4: case SgprP5: + case SgprP8: case SgprPtr32: case SgprPtr64: case SgprPtr128: @@ -1055,10 +1060,12 @@ void RegBankLegalizeHelper::applyMappingDst( case Sgpr32: case Sgpr64: case Sgpr128: + case SgprP0: case SgprP1: case SgprP3: case SgprP4: case SgprP5: + case SgprP8: case SgprV2S16: case SgprV2S32: case SgprV4S32: @@ -1198,10 +1205,12 @@ void RegBankLegalizeHelper::applyMappingSrc( case Sgpr32: case Sgpr64: case Sgpr128: + case SgprP0: case SgprP1: case SgprP3: case SgprP4: case SgprP5: + case SgprP8: case SgprV2S16: case SgprV2S32: case SgprV4S32: { diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 1e5885a25c195..615b911a22903 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -66,6 +66,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg) == LLT::pointer(4, 64); case P5: return MRI.getType(Reg) == LLT::pointer(5, 32); + case P8: + return MRI.getType(Reg) == LLT::pointer(8, 128); case Ptr32: return isAnyPtr(MRI.getType(Reg), 32); case Ptr64: @@ -108,6 +110,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isUniform(Reg); case UniP5: return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isUniform(Reg); + case UniP8: + return MRI.getType(Reg) == LLT::pointer(8, 128) && MUI.isUniform(Reg); case UniPtr32: return isAnyPtr(MRI.getType(Reg), 32) && MUI.isUniform(Reg); case UniPtr64: @@ -918,6 +922,15 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, addRulesForGOpcs({G_READSTEADYCOUNTER, G_READCYCLECOUNTER}, Standard) .Uni(S64, {{Sgpr64}, {}}); + addRulesForGOpcs({G_BLOCK_ADDR}).Any({{UniP0}, {{SgprP0}, {}}}); + + addRulesForGOpcs({G_GLOBAL_VALUE}) + .Any({{UniP0}, {{SgprP0}, {}}}) + .Any({{UniP1}, {{SgprP1}, {}}}) + .Any({{UniP3}, {{SgprP3}, {}}}) + .Any({{UniP4}, {{SgprP4}, {}}}) + .Any({{UniP8}, {{SgprP8}, {}}}); + bool hasSALUFloat = ST->hasSALUFloatInsts(); addRulesForGOpcs({G_FADD}, Standard) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h index 
e6df5d87a2edc..7e4ce7b43dc3b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h @@ -63,6 +63,7 @@ enum UniformityLLTOpPredicateID { P3, P4, P5, + P8, Ptr32, Ptr64, Ptr128, @@ -72,6 +73,7 @@ enum UniformityLLTOpPredicateID { UniP3, UniP4, UniP5, + UniP8, UniPtr32, UniPtr64, UniPtr128, @@ -136,10 +138,12 @@ enum RegBankLLTMappingApplyID { Sgpr32, Sgpr64, Sgpr128, + SgprP0, SgprP1, SgprP3, SgprP4, SgprP5, + SgprP8, SgprPtr32, SgprPtr64, SgprPtr128, diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 125e212a1b946..cdb6698842f41 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -287,9 +287,6 @@ unsigned AMDGPURegisterBankInfo::getBreakDownCost( const RegisterBank & AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, LLT Ty) const { - if (&RC == &AMDGPU::SReg_1RegClass) - return AMDGPU::VCCRegBank; - // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a // VCC-like use. if (TRI->isSGPRClass(&RC)) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td index 1c1a6dac75a17..c37d3096afd3e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td @@ -15,7 +15,7 @@ def VGPRRegBank : RegisterBank<"VGPR", >; // It is helpful to distinguish conditions from ordinary SGPRs. -def VCCRegBank : RegisterBank <"VCC", [SReg_1]>; +def VCCRegBank : RegisterBank<"VCC", [SReg_32, SReg_64]>; def AGPRRegBank : RegisterBank <"AGPR", [AGPR_LO16, AGPR_32, AReg_64, AReg_96, AReg_128, AReg_160, AReg_192, AReg_224, AReg_256, AReg_288, AReg_320, AReg_352, AReg_384, AReg_512, AReg_1024] diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 09338c533fdf2..5e0486aa1dd49 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1865,7 +1865,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser { unsigned getConstantBusLimit(unsigned Opcode) const; bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const; - unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const; + MCRegister findImplicitSGPRReadInVOP(const MCInst &Inst) const; bool isSupportedMnemo(StringRef Mnemo, const FeatureBitset &FBS); @@ -3665,7 +3665,8 @@ StringRef AMDGPUAsmParser::getMatchedVariantName() const { return ""; } -unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const { +MCRegister +AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const { const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); for (MCPhysReg Reg : Desc.implicit_uses()) { switch (Reg) { @@ -3679,7 +3680,7 @@ unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const { break; } } - return AMDGPU::NoRegister; + return MCRegister(); } // NB: This code is correct only when used to check constant @@ -3854,9 +3855,9 @@ bool AMDGPUAsmParser::validateConstantBusLimitations( LiteralSize = 4; } - SmallDenseSet SGPRsUsed; - unsigned SGPRUsed = findImplicitSGPRReadInVOP(Inst); - if (SGPRUsed != AMDGPU::NoRegister) { + SmallDenseSet SGPRsUsed; + MCRegister SGPRUsed = findImplicitSGPRReadInVOP(Inst); + if (SGPRUsed) { SGPRsUsed.insert(SGPRUsed); ++ConstantBusUseCount; } diff --git 
a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 207c1da56ca59..8ef5874d7baf9 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -795,14 +795,24 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo, // Intention: print a disassembler message when invalid code is decoded, // for example an SGPR used in a VReg or VISrc (VReg or imm) operand. const MCOperandInfo &OpInfo = Desc.operands()[OpNo]; - int16_t RCID = MII.getOpRegClassID( - OpInfo, STI.getHwMode(MCSubtargetInfo::HwMode_RegInfo)); - if (RCID != -1) { + if (OpInfo.RegClass != -1) { + int16_t RCID = MII.getOpRegClassID( + OpInfo, STI.getHwMode(MCSubtargetInfo::HwMode_RegInfo)); const MCRegisterClass &RC = MRI.getRegClass(RCID); auto Reg = mc2PseudoReg(Op.getReg()); if (!RC.contains(Reg) && !isInlineValue(Reg)) { - O << "/*Invalid register, operand has \'" << MRI.getRegClassName(&RC) - << "\' register class*/"; + bool IsWaveSizeOp = OpInfo.isLookupRegClassByHwMode() && + (OpInfo.RegClass == AMDGPU::SReg_1 || + OpInfo.RegClass == AMDGPU::SReg_1_XEXEC); + // Suppress this comment for a mismatched wavesize. Some users expect to + // be able to assemble and disassemble modules with mixed wavesizes, but + // we do not know the per-function subtarget in MC. + // + // TODO: Should probably print it anyway, maybe a more specific version. + if (!IsWaveSizeOp) { + O << "/*Invalid register, operand has \'" << MRI.getRegClassName(&RC) + << "\' register class*/"; + } } } } else if (Op.isImm()) { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 0bde5d3fd2f26..42e73ec070c15 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -6,13 +6,6 @@ // //===----------------------------------------------------------------------===// -def isWave32 : Predicate<"Subtarget->isWave32()">, - AssemblerPredicate <(any_of FeatureWavefrontSize32, - FeatureAssemblerPermissiveWavesize)>; -def isWave64 : Predicate<"Subtarget->isWave64()">, - AssemblerPredicate <(any_of FeatureWavefrontSize64, - FeatureAssemblerPermissiveWavesize)>; - class AMDGPUMnemonicAlias : MnemonicAlias, PredicateControl; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 54b584ea66965..5ec6e34819a0d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -791,6 +791,17 @@ def : GCNPat< (SI_CALL_ISEL $src0, (i64 0)) >; +// Funnel shift right (fshr) patterns for uniform inputs. +// These patterns implement fshr with scalar instructions by constructing the +// 64-bit value {a, b} and performing a single right shift; e.g. fshr(a, b, 8) +// yields the low 32 bits of ({a, b} >> 8). +def : GCNPat<(UniformTernaryFrag<fshr> i32:$src0, i32:$src1, i32:$src2), + (i32 (EXTRACT_SUBREG (S_LSHR_B64 (REG_SEQUENCE SReg_64, $src1, sub0, $src0, sub1), (S_AND_B32 $src2, (i32 31))), sub0)) +>; + +def : GCNPat<(UniformTernaryFrag<fshr> i32:$src0, i32:$src1, (i32 ShiftAmt32Imm:$src2)), + (i32 (EXTRACT_SUBREG (S_LSHR_B64 (REG_SEQUENCE SReg_64, $src1, sub0, $src0, sub1), $src2), sub0)) +>; + // Wrapper around s_swappc_b64 with extra $callee parameter to track // the called function after regalloc.
def SI_CALL : SPseudoInstSI < diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp index 96131bd591a17..9b710013a09ce 100644 --- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -75,7 +75,7 @@ Vreg1LoweringHelper::Vreg1LoweringHelper(MachineFunction *MF, bool Vreg1LoweringHelper::cleanConstrainRegs(bool Changed) { assert(Changed || ConstrainRegs.empty()); for (Register Reg : ConstrainRegs) - MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass); + MRI->constrainRegClass(Reg, TII->getRegisterInfo().getWaveMaskRegClass()); ConstrainRegs.clear(); return Changed; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index dde425e183a2f..40c0dac33de7b 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -3943,18 +3943,11 @@ bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, unsigned DstSubReg, const TargetRegisterClass *NewRC, LiveIntervals &LIS) const { - unsigned SrcSize = getRegSizeInBits(*SrcRC); - unsigned DstSize = getRegSizeInBits(*DstRC); + // TODO: This should be more aggressive, but be more cautious with very wide + // tuples. unsigned NewSize = getRegSizeInBits(*NewRC); - - // Do not increase size of registers beyond dword, we would need to allocate - // adjacent registers and constraint regalloc more than needed. - - // Always allow dword coalescing. - if (SrcSize <= 32 || DstSize <= 32) - return true; - - return NewSize <= DstSize || NewSize <= SrcSize; + return NewSize <= 128 || NewSize <= getRegSizeInBits(*SrcRC) || + NewSize <= getRegSizeInBits(*DstRC); } unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, @@ -4117,13 +4110,10 @@ const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const { : &AMDGPU::VReg_64RegClass; } +// FIXME: This should be deleted const TargetRegisterClass * SIRegisterInfo::getRegClass(unsigned RCID) const { switch ((int)RCID) { - case AMDGPU::SReg_1RegClassID: - return getBoolRC(); - case AMDGPU::SReg_1_XEXECRegClassID: - return getWaveMaskRegClass(); case -1: return nullptr; default: diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index fc8f46a0d2b93..abe12c17ae76c 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -896,20 +896,6 @@ def SReg_64_Encodable : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v let Size = 64; } -def SReg_1_XEXEC : SIRegisterClass<"AMDGPU", [i1], 32, - (add SReg_64_XEXEC, SReg_32_XEXEC)> { - let CopyCost = 1; - let isAllocatable = 0; - let HasSGPR = 1; -} - -def SReg_1 : SIRegisterClass<"AMDGPU", [i1], 32, - (add SReg_1_XEXEC, EXEC, EXEC_LO, EXEC_HI)> { - let CopyCost = 1; - let isAllocatable = 0; - let HasSGPR = 1; -} - multiclass SRegClass regTypes, SIRegisterTuples regList, @@ -1205,6 +1191,34 @@ defm AV_512 : AVRegClass<16, VReg_512.RegTypes, (add VGPR_512), (add AGPR_512)>; defm AV_1024 : AVRegClass<32, VReg_1024.RegTypes, (add VGPR_1024), (add AGPR_1024)>; } +def SReg_1_XEXEC : SIRegisterClassLike<0, false, false, true>, + RegClassByHwMode< + [DefaultMode_Wave64, + AlignedVGPRNoAGPRMode_Wave64, + AVAlign2LoadStoreMode, + DefaultMode_Wave32, + AlignedVGPRNoAGPRMode_Wave32], + [SReg_64_XEXEC, + SReg_64_XEXEC, + SReg_64_XEXEC, + SReg_32_XM0_XEXEC, // FIXME: Why do the wave32 cases exclude m0? 
+ SReg_32_XM0_XEXEC] +>; + +def SReg_1 : SIRegisterClassLike<0, false, false, true>, + RegClassByHwMode< + [DefaultMode_Wave64, + AlignedVGPRNoAGPRMode_Wave64, + AVAlign2LoadStoreMode, + DefaultMode_Wave32, + AlignedVGPRNoAGPRMode_Wave32], + [SReg_64, + SReg_64, + SReg_64, + SReg_32, + SReg_32] +>; + //===----------------------------------------------------------------------===// // // AlignTarget classes. Artificial classes to swap between @@ -1212,17 +1226,36 @@ defm AV_1024 : AVRegClass<32, VReg_1024.RegTypes, (add VGPR_1024), (add AGPR_102 // //===----------------------------------------------------------------------===// +// We have 3 orthogonal properties to consider (wavefront size, presence +// of AGPRs/MAI instructions, and the aligned-VGPR requirement). +// Unfortunately we need to define the cross product of these states, minus +// unused combinations. + def AV_LdSt_32_Target : RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], - [VGPR_32, AV_32, VGPR_32]>, SIRegisterClassLike<32, true, true> { + [DefaultMode_Wave64, + DefaultMode_Wave32, + AVAlign2LoadStoreMode, + AlignedVGPRNoAGPRMode_Wave64, + AlignedVGPRNoAGPRMode_Wave32], + [VGPR_32, + VGPR_32, + AV_32, + VGPR_32, + VGPR_32]>, + SIRegisterClassLike<32, true, true> { let DecoderMethod = "decodeAVLdSt"; } foreach RegSize = [ 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 512, 1024 ] in { def VReg_#RegSize#_AlignTarget : SIRegisterClassLike, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [DefaultMode_Wave64, + DefaultMode_Wave32, + AVAlign2LoadStoreMode, + AlignedVGPRNoAGPRMode_Wave64, + AlignedVGPRNoAGPRMode_Wave32], [!cast("VReg_"#RegSize), + !cast("VReg_"#RegSize), + !cast("VReg_"#RegSize#_Align2), !cast("VReg_"#RegSize#_Align2), !cast("VReg_"#RegSize#_Align2)]> { let DecoderMethod = "DecodeVReg_"#RegSize#"RegisterClass"; @@ -1230,45 +1263,59 @@ foreach RegSize = [ 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 512, 10 def AReg_#RegSize#_AlignTarget : SIRegisterClassLike, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, /*Unused combination*/], + [DefaultMode_Wave64, /*unused combination*/ AVAlign2LoadStoreMode, /*Unused combination*/ /*Unused combination*/], [!cast("AReg_"#RegSize), + /*unused combination*/ !cast("AReg_"#RegSize#_Align2) + /*Unused combination*/ /*Unused combination*/]> { let DecoderMethod = "DecodeAReg_"#RegSize#"RegisterClass"; } def AV_#RegSize#_AlignTarget : SIRegisterClassLike, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [DefaultMode_Wave32, + DefaultMode_Wave64, + AVAlign2LoadStoreMode, + AlignedVGPRNoAGPRMode_Wave64, + AlignedVGPRNoAGPRMode_Wave32], [!cast("AV_"#RegSize), + !cast("AV_"#RegSize), !cast("AV_"#RegSize#_Align2), + !cast("VReg_"#RegSize#_Align2), !cast("VReg_"#RegSize#_Align2)]> { let DecoderMethod = "DecodeAV_"#RegSize#"RegisterClass"; } def AV_LdSt_#RegSize#_AlignTarget : SIRegisterClassLike, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32], [!cast("VReg_"#RegSize), + !cast("VReg_"#RegSize), !cast("AV_"#RegSize#_Align2), + !cast("VReg_"#RegSize#_Align2), !cast("VReg_"#RegSize#_Align2)]> { let DecoderMethod = "decodeAVLdSt"; } def AV_LdSt_#RegSize#_Align2 : SIRegisterClassLike, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32],
[!cast("VReg_"#RegSize#_Align2), + !cast("VReg_"#RegSize#_Align2), !cast("AV_"#RegSize#_Align2), + !cast("VReg_"#RegSize#_Align2), !cast("VReg_"#RegSize#_Align2)]> { let DecoderMethod = "decodeAVLdSt"; } def AV_LdSt_#RegSize#_Align1 : SIRegisterClassLike, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32], [!cast("VReg_"#RegSize), + !cast("VReg_"#RegSize), !cast("AV_"#RegSize), + !cast("VReg_"#RegSize), !cast("VReg_"#RegSize)]> { let DecoderMethod = "decodeAVLdSt"; } @@ -1276,8 +1323,8 @@ foreach RegSize = [ 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 512, 10 def VS_64_AlignTarget : SIRegisterClassLike<64, true, false, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], - [VS_64, VS_64_Align2, VS_64_Align2]> { + [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32], + [VS_64, VS_64, VS_64_Align2, VS_64_Align2, VS_64_Align2]> { let DecoderMethod = "decodeSrcRegOrImm9"; } diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index 2730ec52294e9..a829b807f33e8 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -1233,18 +1233,12 @@ defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">; // We need to use COPY_TO_REGCLASS to w/a the problem when ReplaceAllUsesWith() // complaints it cannot replace i1 <-> i64/i32 if node was not morphed in place. multiclass ICMP_Pattern { - let WaveSizePredicate = isWave64 in def : GCNPat < - (i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)), - (i64 (COPY_TO_REGCLASS dstInst, SReg_64)) + (WaveSizeVT (AMDGPUsetcc vt:$src0, vt:$src1, cond)), + dstInst >; let WaveSizePredicate = isWave32 in { - def : GCNPat < - (i32 (AMDGPUsetcc vt:$src0, vt:$src1, cond)), - (i32 (COPY_TO_REGCLASS dstInst, SReg_32)) - >; - // Support codegen of i64 setcc in wave32 mode. def : GCNPat < (i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)), diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 94bb60861d670..6b1898f8202c1 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -2207,12 +2207,12 @@ include "VOP3PInstructions.td" include "VOPDInstructions.td" class ClassPat : GCNPat < - (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask)), + (i1 (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask))), (inst i32:$src0_mods, vt:$src0, (V_MOV_B32_e32 timm:$mask)) >; class ClassPat_t16 : GCNPat < - (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask)), + (i1 (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask))), (inst i32:$src0_mods, vt:$src0, SRCMODS.NONE, (V_MOV_B32_e32 timm:$mask)) >; diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index 01fe13b343926..f8196e460ae9c 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -1238,7 +1238,7 @@ uint64_t ARMAsmBackendDarwin::generateCompactUnwindEncoding( // Verify standard frame (lr/r7) was used. 
if (CFARegister != ARM::R7) { DEBUG_WITH_TYPE("compact-unwind", llvm::dbgs() << "frame register is " - << CFARegister + << CFARegister.id() << " instead of r7\n"); return CU::UNWIND_ARM_MODE_DWARF; } diff --git a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp index fc794c4968b8c..48452f6d9391c 100644 --- a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp +++ b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp @@ -252,7 +252,7 @@ class AVROperand : public MCParsedAsmOperand { O << "Token: \"" << getToken() << "\""; break; case k_Register: - O << "Register: " << getReg(); + O << "Register: " << getReg().id(); break; case k_Immediate: O << "Immediate: \""; @@ -262,7 +262,7 @@ class AVROperand : public MCParsedAsmOperand { case k_Memri: { // only manually print the size for non-negative values, // as the sign is inserted automatically. - O << "Memri: \"" << getReg() << '+'; + O << "Memri: \"" << getReg().id() << '+'; MAI.printExpr(O, *getImm()); O << "\""; break; diff --git a/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp b/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp index 5548ad1ebff5e..84a64ba0aa4ff 100644 --- a/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp +++ b/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp @@ -82,7 +82,7 @@ static DecodeStatus DecodeGPR8RegisterClass(MCInst &Inst, unsigned RegNo, if (RegNo > 31) return MCDisassembler::Fail; - unsigned Register = GPRDecoderTable[RegNo]; + MCRegister Register = GPRDecoderTable[RegNo]; Inst.addOperand(MCOperand::createReg(Register)); return MCDisassembler::Success; } @@ -174,7 +174,7 @@ static DecodeStatus decodeLoadStore(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { // Get the register that will be loaded or stored. - unsigned RegVal = GPRDecoderTable[(Insn >> 4) & 0x1f]; + MCRegister RegVal = GPRDecoderTable[(Insn >> 4) & 0x1f]; // Decode LDD/STD with offset less than 8. if ((Insn & 0xf000) == 0x8000) { diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp index 4bb16e237db48..fbb130ccde681 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp @@ -96,7 +96,7 @@ AVRMCCodeEmitter::loadStorePostEncoder(const MCInst &MI, unsigned EncodedValue, EncodedValue |= (1 << 12); // Encode the pointer register.
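// AVR has three 16-bit pointer register pairs: X (R27:R26), Y (R29:R28) and
// Z (R31:R30); each selects a distinct bit pattern in the encoded
// instruction (the X pair is the 0xc case below).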
- switch (MI.getOperand(Idx).getReg()) { + switch (MI.getOperand(Idx).getReg().id()) { case AVR::R27R26: EncodedValue |= 0xc; break; diff --git a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp index d96f403d2f814..9f86322a81b3e 100644 --- a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp +++ b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp @@ -172,7 +172,7 @@ struct BPFOperand : public MCParsedAsmOperand { break; case Register: OS << ""; + OS << getReg().id() << ">"; break; case Token: OS << "'" << getToken() << "'"; diff --git a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp index b94b1484205ae..c18db982bfd97 100644 --- a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp +++ b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp @@ -463,7 +463,7 @@ void HexagonOperand::print(raw_ostream &OS, const MCAsmInfo &MAI) const { break; case Register: OS << ""; + OS << getReg().id() << ">"; break; case Token: OS << "'" << getToken() << "'"; diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp index 9b6bc5ade379d..0b2279bb2cfe6 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp @@ -385,7 +385,7 @@ bool HexagonMCChecker::checkSlots() { bool HexagonMCChecker::checkPredicates() { // Check for proper use of new predicate registers. for (const auto &I : NewPreds) { - unsigned P = I; + MCRegister P = I; if (!Defs.count(P) || LatePreds.count(P) || Defs.count(Hexagon::P3_0)) { // Error out if the new predicate register is not defined, @@ -398,7 +398,7 @@ bool HexagonMCChecker::checkPredicates() { // Check for proper use of auto-anded of predicate registers. for (const auto &I : LatePreds) { - unsigned P = I; + MCRegister P = I; if (LatePreds.count(P) > 1 || Defs.count(P)) { // Error out if predicate register defined "late" multiple times or @@ -607,7 +607,7 @@ void HexagonMCChecker::checkRegisterCurDefs() { bool HexagonMCChecker::checkRegisters() { // Check for proper register definitions. for (const auto &I : Defs) { - unsigned R = I.first; + MCRegister R = I.first; if (isLoopRegister(R) && Defs.count(R) > 1 && (HexagonMCInstrInfo::isInnerLoop(MCB) || @@ -620,8 +620,8 @@ bool HexagonMCChecker::checkRegisters() { if (SoftDefs.count(R)) { // Error out for explicit changes to registers also weakly defined // (e.g., "{ usr = r0; r0 = sfadd(...) }"). - unsigned UsrR = Hexagon::USR; // Silence warning about mixed types in ?:. - unsigned BadR = RI.isSubRegister(Hexagon::USR, R) ? UsrR : R; + MCRegister UsrR = Hexagon::USR; + MCRegister BadR = RI.isSubRegister(Hexagon::USR, R) ? UsrR : R; reportErrorRegisters(BadR); return false; } @@ -633,8 +633,8 @@ bool HexagonMCChecker::checkRegisters() { if (PM.count(Unconditional)) { // Error out on an unconditional change when there are any other // changes, conditional or not. - unsigned UsrR = Hexagon::USR; - unsigned BadR = RI.isSubRegister(Hexagon::USR, R) ? UsrR : R; + MCRegister UsrR = Hexagon::USR; + MCRegister BadR = RI.isSubRegister(Hexagon::USR, R) ? UsrR : R; reportErrorRegisters(BadR); return false; } @@ -664,7 +664,7 @@ bool HexagonMCChecker::checkRegisters() { // Check for use of temporary definitions. 
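  // A temporary (.tmp) definition never reaches the register file, so it is
  // only meaningful when another instruction in the same packet reads it; an
  // unused .tmp def is an error, modulo the vhist special case below.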
for (const auto &I : TmpDefs) { - unsigned R = I; + MCRegister R = I; if (!Uses.count(R)) { // special case for vhist @@ -765,12 +765,12 @@ void HexagonMCChecker::compoundRegisterMap(unsigned &Register) { } } -void HexagonMCChecker::reportErrorRegisters(unsigned Register) { +void HexagonMCChecker::reportErrorRegisters(MCRegister Register) { reportError("register `" + Twine(RI.getName(Register)) + "' modified more than once"); } -void HexagonMCChecker::reportErrorNewValue(unsigned Register) { +void HexagonMCChecker::reportErrorNewValue(MCRegister Register) { reportError("register `" + Twine(RI.getName(Register)) + "' used with `.new' " "but not validly modified in the same packet"); diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h index e9b87c5315fe4..8beee8d7ec8eb 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h @@ -39,41 +39,41 @@ class HexagonMCChecker { bool ReportErrors; /// Set of definitions: register #, if predicated, if predicated true. - using PredSense = std::pair; + using PredSense = std::pair; static const PredSense Unconditional; using PredSet = std::multiset; using PredSetIterator = std::multiset::iterator; - using DefsIterator = DenseMap::iterator; - DenseMap Defs; + using DefsIterator = DenseMap::iterator; + DenseMap Defs; /// Set of weak definitions whose clashes should be enforced selectively. - using SoftDefsIterator = std::set::iterator; - std::set SoftDefs; + using SoftDefsIterator = std::set::iterator; + std::set SoftDefs; /// Set of temporary definitions not committed to the register file. - using TmpDefsIterator = std::set::iterator; - std::set TmpDefs; + using TmpDefsIterator = std::set::iterator; + std::set TmpDefs; /// Set of new predicates used. - using NewPredsIterator = std::set::iterator; - std::set NewPreds; + using NewPredsIterator = std::set::iterator; + std::set NewPreds; /// Set of predicates defined late. - using LatePredsIterator = std::multiset::iterator; - std::multiset LatePreds; + using LatePredsIterator = std::multiset::iterator; + std::multiset LatePreds; /// Set of uses. - using UsesIterator = std::set::iterator; - std::set Uses; + using UsesIterator = std::set::iterator; + std::set Uses; /// Pre-defined set of read-only registers. - using ReadOnlyIterator = std::set::iterator; - std::set ReadOnly; + using ReadOnlyIterator = std::set::iterator; + std::set ReadOnly; // Contains the vector-pair-registers with the even number // first ("v0:1", e.g.) used/def'd in this packet. 
- std::set ReversePairs; + std::set ReversePairs; void init(); void init(MCInst const &); @@ -107,7 +107,7 @@ class HexagonMCChecker { static void compoundRegisterMap(unsigned &); - bool isLoopRegister(unsigned R) const { + bool isLoopRegister(MCRegister R) const { return (Hexagon::SA0 == R || Hexagon::LC0 == R || Hexagon::SA1 == R || Hexagon::LC1 == R); } @@ -120,8 +120,8 @@ class HexagonMCChecker { MCSubtargetInfo const &STI, bool CopyReportErrors); bool check(bool FullCheck = true); - void reportErrorRegisters(unsigned Register); - void reportErrorNewValue(unsigned Register); + void reportErrorRegisters(MCRegister Register); + void reportErrorNewValue(MCRegister Register); void reportError(SMLoc Loc, Twine const &Msg); void reportNote(SMLoc Loc, Twine const &Msg); void reportError(Twine const &Msg); diff --git a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp index cef77f1c512f6..0444c865f6866 100644 --- a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp +++ b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp @@ -559,7 +559,7 @@ struct LanaiOperand : public MCParsedAsmOperand { OS << "Token: " << getToken() << "\n"; break; case REGISTER: - OS << "Reg: %r" << getReg() << "\n"; + OS << "Reg: %r" << getReg().id() << "\n"; break; case MEMORY_IMM: OS << "MemImm: "; @@ -567,14 +567,14 @@ struct LanaiOperand : public MCParsedAsmOperand { OS << '\n'; break; case MEMORY_REG_IMM: - OS << "MemRegImm: " << getMemBaseReg() << "+"; + OS << "MemRegImm: " << getMemBaseReg().id() << "+"; MAI.printExpr(OS, *getMemOffset()); OS << '\n'; break; case MEMORY_REG_REG: assert(getMemOffset() == nullptr); - OS << "MemRegReg: " << getMemBaseReg() << "+" - << "%r" << getMemOffsetReg() << "\n"; + OS << "MemRegReg: " << getMemBaseReg().id() << "+" + << "%r" << getMemOffsetReg().id() << "\n"; break; } } diff --git a/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp b/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp index a31c8ec1b2bb5..a8891d686abe8 100644 --- a/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp +++ b/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp @@ -230,7 +230,7 @@ class MSP430Operand : public MCParsedAsmOperand { O << "Token " << Tok; break; case k_Reg: - O << "Register " << Reg; + O << "Register " << Reg.id(); break; case k_Imm: O << "Immediate "; @@ -241,10 +241,10 @@ class MSP430Operand : public MCParsedAsmOperand { MAI.printExpr(O, *Mem.Offset); break; case k_IndReg: - O << "RegInd " << Reg; + O << "RegInd " << Reg.id(); break; case k_PostIndReg: - O << "PostInc " << Reg; + O << "PostInc " << Reg.id(); break; } } diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.h b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.h index 0c9c3bc51f433..8f2ad48efa9d7 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.h +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.h @@ -16,11 +16,12 @@ #include "MCTargetDesc/SPIRVBaseInfo.h" #include "llvm/ADT/DenseSet.h" #include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCRegister.h" namespace llvm { class SPIRVInstPrinter : public MCInstPrinter { private: - SmallDenseMap ExtInstSetIDs; + SmallDenseMap ExtInstSetIDs; void recordOpExtInstImport(const MCInst *MI); public: diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp index 275165d2acb07..a24543b699ab4 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp +++ 
b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp @@ -218,7 +218,7 @@ void SystemZInstPrinterCommon::printBDXAddrOperand(const MCInst *MI, int OpNum, void SystemZInstPrinterCommon::printBDLAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O) { - unsigned Base = MI->getOperand(OpNum).getReg(); + MCRegister Base = MI->getOperand(OpNum).getReg(); const MCOperand &DispMO = MI->getOperand(OpNum + 1); uint64_t Length = MI->getOperand(OpNum + 2).getImm(); printOperand(DispMO, &MAI, O); @@ -232,9 +232,9 @@ void SystemZInstPrinterCommon::printBDLAddrOperand(const MCInst *MI, int OpNum, void SystemZInstPrinterCommon::printBDRAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O) { - unsigned Base = MI->getOperand(OpNum).getReg(); + MCRegister Base = MI->getOperand(OpNum).getReg(); const MCOperand &DispMO = MI->getOperand(OpNum + 1); - unsigned Length = MI->getOperand(OpNum + 2).getReg(); + MCRegister Length = MI->getOperand(OpNum + 2).getReg(); printOperand(DispMO, &MAI, O); O << "("; printRegName(O, Length); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index 66ed8b078b808..9d8e09c09e9ea 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -988,20 +988,36 @@ bool WebAssemblyFastISel::selectSelect(const Instruction *I) { bool WebAssemblyFastISel::selectTrunc(const Instruction *I) { const auto *Trunc = cast(I); - Register Reg = getRegForValue(Trunc->getOperand(0)); - if (Reg == 0) + const Value *Op = Trunc->getOperand(0); + MVT::SimpleValueType From = getSimpleType(Op->getType()); + MVT::SimpleValueType To = getLegalType(getSimpleType(Trunc->getType())); + Register In = getRegForValue(Op); + if (In == 0) return false; - unsigned FromBitWidth = Trunc->getOperand(0)->getType()->getIntegerBitWidth(); - unsigned ToBitWidth = Trunc->getType()->getIntegerBitWidth(); + auto Truncate = [&](Register Reg) -> unsigned { + if (From == MVT::i64) { + if (To == MVT::i64) + return copyValue(Reg); + + if (To == MVT::i1 || To == MVT::i8 || To == MVT::i16 || To == MVT::i32) { + Register Result = createResultReg(&WebAssembly::I32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, + TII.get(WebAssembly::I32_WRAP_I64), Result) + .addReg(Reg); + return Result; + } + } - if (ToBitWidth <= 32 && (32 < FromBitWidth && FromBitWidth <= 64)) { - Register Result = createResultReg(&WebAssembly::I32RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, - TII.get(WebAssembly::I32_WRAP_I64), Result) - .addReg(Reg); - Reg = Result; - } + if (From == MVT::i32) + return copyValue(Reg); + + return 0; + }; + + unsigned Reg = Truncate(In); + if (Reg == 0) + return false; updateValueMap(Trunc, Reg); return true; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 92fca90ddb88a..8e4edefec42fd 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -19,6 +19,7 @@ #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumeBundleQueries.h" #include "llvm/Analysis/AssumptionCache.h" @@ -4091,6 +4092,70 @@ Instruction *InstCombinerImpl::visitCallBrInst(CallBrInst &CBI) { return visitCallBase(CBI); } +static Value *optimizeModularFormat(CallInst *CI, IRBuilderBase &B) { + 
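  // Illustrative attribute shape (all values hypothetical): a call tagged
  //   "modular-format"="...,...,2,printf,__printf_impl,float"
  // carries the first format-argument index (Args[2]), the public function
  // name (Args[3]), the implementation prefix (Args[4]), and the aspect list
  // that is filtered below; the first two fields are not yet used (see TODO).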
if (!CI->hasFnAttr("modular-format")) + return nullptr; + + SmallVector Args( + llvm::split(CI->getFnAttr("modular-format").getValueAsString(), ',')); + // TODO: Make use of the first two arguments + unsigned FirstArgIdx; + [[maybe_unused]] bool Error; + Error = Args[2].getAsInteger(10, FirstArgIdx); + assert(!Error && "invalid first arg index"); + --FirstArgIdx; + StringRef FnName = Args[3]; + StringRef ImplName = Args[4]; + ArrayRef AllAspects = ArrayRef(Args).drop_front(5); + + if (AllAspects.empty()) + return nullptr; + + SmallVector NeededAspects; + for (StringRef Aspect : AllAspects) { + if (Aspect == "float") { + if (llvm::any_of( + llvm::make_range(std::next(CI->arg_begin(), FirstArgIdx), + CI->arg_end()), + [](Value *V) { return V->getType()->isFloatingPointTy(); })) + NeededAspects.push_back("float"); + } else { + // Unknown aspects are always considered to be needed. + NeededAspects.push_back(Aspect); + } + } + + if (NeededAspects.size() == AllAspects.size()) + return nullptr; + + Module *M = CI->getModule(); + LLVMContext &Ctx = M->getContext(); + Function *Callee = CI->getCalledFunction(); + FunctionCallee ModularFn = M->getOrInsertFunction( + FnName, Callee->getFunctionType(), + Callee->getAttributes().removeFnAttribute(Ctx, "modular-format")); + CallInst *New = cast(CI->clone()); + New->setCalledFunction(ModularFn); + New->removeFnAttr("modular-format"); + B.Insert(New); + + const auto ReferenceAspect = [&](StringRef Aspect) { + SmallString<20> Name = ImplName; + Name += '_'; + Name += Aspect; + Function *RelocNoneFn = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::reloc_none); + B.CreateCall(RelocNoneFn, + {MetadataAsValue::get(Ctx, MDString::get(Ctx, Name))}); + }; + + llvm::sort(NeededAspects); + for (StringRef Request : NeededAspects) + ReferenceAspect(Request); + + return New; +} + Instruction *InstCombinerImpl::tryOptimizeCall(CallInst *CI) { if (!CI->getCalledFunction()) return nullptr; @@ -4112,6 +4177,10 @@ Instruction *InstCombinerImpl::tryOptimizeCall(CallInst *CI) { ++NumSimplified; return CI->use_empty() ? CI : replaceInstUsesWith(*CI, With); } + if (Value *With = optimizeModularFormat(CI, Builder)) { + ++NumSimplified; + return CI->use_empty() ? CI : replaceInstUsesWith(*CI, With); + } return nullptr; } diff --git a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp index 0688bc7ac08eb..726d94b27a7f2 100644 --- a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp +++ b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp @@ -1992,6 +1992,8 @@ void CHR::addToMergedCondition(bool IsTrueBiased, Value *Cond, // Use logical and to avoid propagating poison from later conditions. MergedCondition = IRB.CreateLogicalAnd(MergedCondition, Cond); + setExplicitlyUnknownBranchWeightsIfProfiled( + *cast(MergedCondition), DEBUG_TYPE); } void CHR::transformScopes(SmallVectorImpl &CHRScopes) { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 345bc63081b81..83dee09f94b99 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4226,18 +4226,16 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { // Selects are only modelled in the legacy cost model for safe // divisors. 
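      // A "safe divisor" select guards a division so that the vector form
      // cannot divide by zero, e.g.
      //   %d = select i1 %c, i32 %x, i32 1
      //   %r = udiv i32 %y, %d
      // which is why only selects feeding UDiv/SDiv/URem/SRem defer to the
      // legacy model here.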
case Instruction::Select: { - VPValue *VPV = VPI->getVPSingleValue(); - if (VPV->getNumUsers() == 1) { - if (auto *WR = dyn_cast(*VPV->user_begin())) { - switch (WR->getOpcode()) { - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::URem: - case Instruction::SRem: - continue; - default: - break; - } + if (auto *WR = + dyn_cast_or_null(VPI->getSingleUser())) { + switch (WR->getOpcode()) { + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + continue; + default: + break; } } C += VPI->cost(VF, CostCtx); @@ -6976,11 +6974,10 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan, // the more accurate VPlan-based cost model. for (VPRecipeBase &R : *Plan.getVectorPreheader()) { auto *VPI = dyn_cast(&R); - if (!VPI || VPI->getOpcode() != Instruction::Select || - VPI->getNumUsers() != 1) + if (!VPI || VPI->getOpcode() != Instruction::Select) continue; - if (auto *WR = dyn_cast(*VPI->user_begin())) { + if (auto *WR = dyn_cast_or_null(VPI->getSingleUser())) { switch (WR->getOpcode()) { case Instruction::UDiv: case Instruction::SDiv: @@ -8231,9 +8228,7 @@ VPRecipeBuilder::tryToCreatePartialReduction(VPInstruction *Reduction, VPValue *BinOp = Reduction->getOperand(0); VPValue *Accumulator = Reduction->getOperand(1); - VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe(); - if (isa(BinOpRecipe) || - isa(BinOpRecipe)) + if (isa(BinOp) || isa(BinOp)) std::swap(BinOp, Accumulator); assert(ScaleFactor == @@ -8801,7 +8796,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( // with fewer lanes than the VF. So the operands of the select would have // different numbers of lanes. Partial reductions mask the input instead. if (!PhiR->isInLoop() && CM.foldTailByMasking() && - !isa(OrigExitingVPV->getDefiningRecipe())) { + !isa(OrigExitingVPV)) { VPValue *Cond = RecipeBuilder.getBlockInMask(PhiR->getParent()); std::optional FMFs = PhiTy->isFloatingPointTy() diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index ffba0bdbdbe83..cc53b0dd3577e 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -16844,6 +16844,16 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( } return false; }; + auto CheckNonSchedulableOrdering = [&](const TreeEntry *E, + Instruction *InsertPt) { + return TEUseEI && TEUseEI.UserTE && TEUseEI.UserTE->hasCopyableElements() && + !TEUseEI.UserTE->isCopyableElement( + const_cast(TEInsertPt)) && + isUsedOutsideBlock(const_cast(TEInsertPt)) && + InsertPt->getNextNode() == TEInsertPt && + (!E->hasCopyableElements() || !E->isCopyableElement(InsertPt) || + !isUsedOutsideBlock(InsertPt)); + }; for (Value *V : VL) { if (isConstant(V) || !VisitedValue.insert(V).second) continue; @@ -16926,6 +16936,11 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( // The node is reused - exit. if (CheckAndUseSameNode(TEPtr)) break; + // If the parent node has copyable elements and its last instruction is + // used outside the block, and TEPtr's last instruction immediately + // precedes it, exit to preserve the def-use chain.
+ if (CheckNonSchedulableOrdering(UseEI.UserTE, InsertPt)) + continue; VToTEs.insert(TEPtr); } if (ArrayRef VTEs = getSplitTreeEntries(V); !VTEs.empty()) { @@ -16961,7 +16976,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( if (none_of(TE->CombinedEntriesWithIndices, [&](const auto &P) { return P.first == VTE->Idx; })) { Instruction &LastBundleInst = getLastInstructionInBundle(VTE); - if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst)) + if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst) || + CheckNonSchedulableOrdering(VTE, &LastBundleInst)) continue; } // The node is reused - exit. @@ -21003,6 +21019,22 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, return isUsedOutsideBlock(V); })) return std::nullopt; + // If any instruction is used only outside the block and its operand is + // placed immediately before it, do not schedule: it may break the def-use + // chain. + if (S.areInstructionsWithCopyableElements() && any_of(VL, [&](Value *V) { + if (isa(V) || S.isCopyableElement(V)) + return false; + if (isUsedOutsideBlock(V)) { + for (Value *Op : cast(V)->operands()) { + auto *I = dyn_cast(Op); + if (!I) + continue; + return SLP->isVectorized(I) && I->getNextNode() == V; + } + } + return false; + })) + return std::nullopt; bool HasCopyables = S.areInstructionsWithCopyableElements(); if (((!HasCopyables && doesNotNeedToSchedule(VL)) || all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 90696ffc3aca7..62dacf912e210 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1428,7 +1428,7 @@ void VPUser::printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const { void VPSlotTracker::assignName(const VPValue *V) { assert(!VPValue2Name.contains(V) && "VPValue already has a name!"); auto *UV = V->getUnderlyingValue(); - auto *VPI = dyn_cast_or_null(V->getDefiningRecipe()); + auto *VPI = dyn_cast_or_null(V); if (!UV && !(VPI && !VPI->getName().empty())) { VPValue2Name[V] = (Twine("vp<%") + Twine(NextSlot) + ">").str(); NextSlot++; diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 663e31a499b01..92ff0dcf67927 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -798,8 +798,8 @@ void VPlanTransforms::addMinimumVectorEpilogueIterationCheck( bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) { auto GetMinMaxCompareValue = [](VPReductionPHIRecipe *RedPhiR) -> VPValue * { - auto *MinMaxR = dyn_cast( - RedPhiR->getBackedgeValue()->getDefiningRecipe()); + auto *MinMaxR = + dyn_cast_or_null(RedPhiR->getBackedgeValue()); if (!MinMaxR) return nullptr; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 9a5591feb3d05..beae8051e75dc 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -356,7 +356,7 @@ VPPartialReductionRecipe::computeCost(ElementCount VF, // recipe.
auto HandleWiden = [&](VPWidenRecipe *Widen) { if (match(Widen, m_Sub(m_ZeroInt(), m_VPValue(Op)))) { - Widen = dyn_cast(Op->getDefiningRecipe()); + Widen = dyn_cast(Op); } Opcode = Widen->getOpcode(); VPRecipeBase *ExtAR = Widen->getOperand(0)->getDefiningRecipe(); @@ -381,11 +381,10 @@ VPPartialReductionRecipe::computeCost(ElementCount VF, InputTypeA = Ctx.Types.inferScalarType(OpR->getOperand(0)); ExtAType = GetExtendKind(OpR); } else if (isa(OpR)) { - auto RedPhiOp1R = getOperand(1)->getDefiningRecipe(); - if (isa(RedPhiOp1R)) { + if (auto RedPhiOp1R = dyn_cast_or_null(getOperand(1))) { InputTypeA = Ctx.Types.inferScalarType(RedPhiOp1R->getOperand(0)); ExtAType = GetExtendKind(RedPhiOp1R); - } else if (auto Widen = dyn_cast(RedPhiOp1R)) + } else if (auto Widen = dyn_cast_or_null(getOperand(1))) HandleWiden(Widen); } else if (auto Widen = dyn_cast(OpR)) { HandleWiden(Widen); @@ -3195,10 +3194,10 @@ bool VPReplicateRecipe::shouldPack() const { static const SCEV *getAddressAccessSCEV(const VPValue *Ptr, ScalarEvolution &SE, const Loop *L) { auto *PtrR = Ptr->getDefiningRecipe(); - if (!PtrR || !((isa(PtrR) && - cast(PtrR)->getOpcode() == + if (!PtrR || !((isa(Ptr) && + cast(Ptr)->getOpcode() == Instruction::GetElementPtr) || - isa(PtrR) || + isa(Ptr) || match(Ptr, m_GetElementPtr(m_VPValue(), m_VPValue())))) return nullptr; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 1b9bc4cc45163..bb9eed0e0ddb9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -580,10 +580,13 @@ void VPlanTransforms::removeDeadRecipes(VPlan &Plan) { // Check if R is a dead VPPhi <-> update cycle and remove it. auto *PhiR = dyn_cast(&R); - if (!PhiR || PhiR->getNumOperands() != 2 || PhiR->getNumUsers() != 1) + if (!PhiR || PhiR->getNumOperands() != 2) + continue; + VPUser *PhiUser = PhiR->getSingleUser(); + if (!PhiUser) continue; VPValue *Incoming = PhiR->getOperand(1); - if (*PhiR->user_begin() != Incoming->getDefiningRecipe() || + if (PhiUser != Incoming->getDefiningRecipe() || Incoming->getNumUsers() != 1) continue; PhiR->replaceAllUsesWith(PhiR->getOperand(0)); @@ -1307,7 +1310,7 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) { isa(X)) { auto *Phi = cast(X); if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) && - Phi->getNumUsers() == 1 && (*Phi->user_begin() == Def)) { + Phi->getSingleUser() == Def) { Phi->setOperand(0, Y); Def->replaceAllUsesWith(Phi); return; @@ -1592,10 +1595,11 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, // Currently only handle cases where the single user is a header-mask // comparison with the backedge-taken-count. - if (!match(*WideIV->user_begin(), - m_ICmp(m_Specific(WideIV), - m_Broadcast( - m_Specific(Plan.getOrCreateBackedgeTakenCount()))))) + VPUser *SingleUser = WideIV->getSingleUser(); + if (!SingleUser || + !match(SingleUser, m_ICmp(m_Specific(WideIV), + m_Broadcast(m_Specific( + Plan.getOrCreateBackedgeTakenCount()))))) continue; // Update IV operands and comparison bound to use new narrower type. 
@@ -1607,7 +1611,7 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, auto *NewBTC = new VPWidenCastRecipe( Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy); Plan.getVectorPreheader()->appendRecipe(NewBTC); - auto *Cmp = cast(*WideIV->user_begin()); + auto *Cmp = cast(WideIV->getSingleUser()); Cmp->setOperand(1, NewBTC); MadeChange = true; @@ -3715,11 +3719,9 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, // Try to match reduce.add(mul(...)). if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) { - auto *RecipeA = - dyn_cast_if_present(A->getDefiningRecipe()); - auto *RecipeB = - dyn_cast_if_present(B->getDefiningRecipe()); - auto *Mul = cast(VecOp->getDefiningRecipe()); + auto *RecipeA = dyn_cast_if_present(A); + auto *RecipeB = dyn_cast_if_present(B); + auto *Mul = cast(VecOp); // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const))) ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul); @@ -3744,10 +3746,10 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, // Match reduce.add(ext(mul(A, B))). if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) { - auto *Ext = cast(VecOp->getDefiningRecipe()); - auto *Mul = cast(Ext->getOperand(0)->getDefiningRecipe()); - auto *Ext0 = dyn_cast_if_present(A->getDefiningRecipe()); - auto *Ext1 = dyn_cast_if_present(B->getDefiningRecipe()); + auto *Ext = cast(VecOp); + auto *Mul = cast(Ext->getOperand(0)); + auto *Ext0 = dyn_cast_if_present(A); + auto *Ext1 = dyn_cast_if_present(B); // reduce.add(ext(mul(ext, const))) // -> reduce.add(ext(mul(ext, ext(const)))) @@ -4325,12 +4327,12 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, // Check if all values feeding InterleaveR are matching wide recipes, which // operands that can be narrowed. - auto *WideMember0 = dyn_cast_or_null( - InterleaveR->getStoredValues()[0]->getDefiningRecipe()); + auto *WideMember0 = + dyn_cast_or_null(InterleaveR->getStoredValues()[0]); if (!WideMember0) return; for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) { - auto *R = dyn_cast_or_null(V->getDefiningRecipe()); + auto *R = dyn_cast_or_null(V); if (!R || R->getOpcode() != WideMember0->getOpcode() || R->getNumOperands() > 2) return; diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 5da74630ef626..09fdf5a731816 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -150,6 +150,13 @@ class LLVM_ABI_FOR_TEST VPValue { bool hasOneUse() const { return getNumUsers() == 1; } + /// Return the single user of this value, or nullptr if there is not exactly + /// one user. + VPUser *getSingleUser() { return hasOneUse() ? *user_begin() : nullptr; } + const VPUser *getSingleUser() const { + return hasOneUse() ? 
*user_begin() : nullptr; + } + void replaceAllUsesWith(VPValue *New); /// Go through the uses list for this VPValue and make each use point to \p diff --git a/llvm/test/CodeGen/AArch64/seh-extended-spills.ll b/llvm/test/CodeGen/AArch64/seh-extended-spills.ll index ecc22703ef584..54f8e3f4c5a64 100644 --- a/llvm/test/CodeGen/AArch64/seh-extended-spills.ll +++ b/llvm/test/CodeGen/AArch64/seh-extended-spills.ll @@ -1,4 +1,5 @@ ; RUN: llc -mtriple aarch64-unknown-windows-msvc -filetype asm -o - %s | FileCheck %s +; RUN: llc -mtriple aarch64-unknown-windows-msvc -filetype obj -o - %s | llvm-readobj -u - | FileCheck %s -check-prefix CHECK-UNWIND declare dso_local void @g(ptr noundef) define dso_local preserve_mostcc void @f(ptr noundef %p) #0 { @@ -12,23 +13,38 @@ entry: attributes #0 = { nounwind uwtable(sync) } -; CHECK: stp x9, x10, [sp, #[[OFFSET_0:[0-9]+]]] -; CHECK-NEXT: .seh_save_any_reg_p x9, [[OFFSET_0]] -; CHECK: stp x11, x12, [sp, #[[OFFSET_1:[0-9]+]]] -; CHECK-NEXT: .seh_save_any_reg_p x11, [[OFFSET_1]] -; CHECK: stp x13, x14, [sp, #[[OFFSET_2:[0-9]+]]] -; CHECK-NEXT: .seh_save_any_reg_p x13, [[OFFSET_2]] -; CHECK: str x15, [sp, #[[OFFSET_3:[0-9]+]]] -; CHECK-NEXT: .seh_save_any_reg x15, [[OFFSET_3]] +; CHECK: str x30, [sp, #16] +; CHECK-NEXT: .seh_save_reg x30, 16 +; CHECK: str x9, [sp, #24] +; CHECK-NEXT: .seh_save_any_reg x9, 24 +; CHECK: stp x10, x11, [sp, #32 +; CHECK-NEXT: .seh_save_any_reg_p x10, 32 +; CHECK: stp x12, x13, [sp, #48] +; CHECK-NEXT: .seh_save_any_reg_p x12, 48 +; CHECK: stp x14, x15, [sp, #64] +; CHECK-NEXT: .seh_save_any_reg_p x14, 64 ; CHECK: .seh_endprologue ; CHECK: .seh_startepilogue -; CHECK: ldr x15, [sp, #[[OFFSET_3]]] -; CHECK-NEXT: .seh_save_any_reg x15, [[OFFSET_3]] -; CHECK: ldp x13, x14, [sp, #[[OFFSET_2]]] -; CHECK-NEXT: .seh_save_any_reg_p x13, [[OFFSET_2]] -; CHECK: ldp x11, x12, [sp, #[[OFFSET_1]]] -; CHECK-NEXT: .seh_save_any_reg_p x11, [[OFFSET_1]] -; CHECK: ldp x9, x10, [sp, #[[OFFSET_0]]] -; CHECK-NEXT: .seh_save_any_reg_p x9, [[OFFSET_0]] +; CHECK: ldp x14, x15, [sp, #64] +; CHECK-NEXT: .seh_save_any_reg_p x14, 64 +; CHECK: ldp x12, x13, [sp, #48] +; CHECK-NEXT: .seh_save_any_reg_p x12, 48 +; CHECK: ldp x10, x11, [sp, #32 +; CHECK-NEXT: .seh_save_any_reg_p x10, 32 +; CHECK: ldr x9, [sp, #24] +; CHECK-NEXT: .seh_save_any_reg x9, 24 +; CHECK: ldr x30, [sp, #16] +; CHECK-NEXT: .seh_save_reg x30, 16 + ; CHECK: .seh_endepilogue + +; CHECK-UNWIND: Prologue [ +; CHECK-UNWIND: 0xe74e04 ; stp x14, x15, [sp, #64] +; CHECK-UNWIND: 0xe74c03 ; stp x12, x13, [sp, #48] +; CHECK-UNWIND: 0xe74a02 ; stp x10, x11, [sp, #32] +; CHECK-UNWIND: 0xe70903 ; str x9, [sp, #24] +; CHECK-UNWIND: 0xd2c2 ; str x30, [sp, #16] +; CHECK-UNWIND: 0x05 ; sub sp, #80 +; CHECK-UNWIND: 0xe4 ; end +; CHECK-UNWIND: ] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-value-addrspaces.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-value-addrspaces.ll new file mode 100644 index 0000000000000..cf9524b860fd2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-value-addrspaces.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s + +@flat = external global i32, align 4 +@global = external addrspace(1) global i32, align 4 +@lds = addrspace(3) global i32 poison, align 4 +@constant = external addrspace(4) constant i32, align 4 +@buf = external addrspace(8) global i8 + +define ptr 
@global_value_as0_external() { +; GCN-LABEL: global_value_as0_external: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, flat@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, flat@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: s_setpc_b64 s[30:31] + ret ptr @flat +} + +define ptr addrspace(1) @global_value_as1_external() { +; GCN-LABEL: global_value_as1_external: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, global@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, global@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: s_setpc_b64 s[30:31] + ret ptr addrspace(1) @global +} + +define ptr addrspace(4) @global_value_as4_external() { +; GCN-LABEL: global_value_as4_external: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, constant@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, constant@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: s_setpc_b64 s[30:31] + ret ptr addrspace(4) @constant +} + +define amdgpu_kernel void @global_value_as3_lds_kernel(ptr addrspace(1) %out) { +; GCN-LABEL: global_value_as3_lds_kernel: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_store_dword v0, v0, s[0:1] +; GCN-NEXT: s_endpgm + %addr = ptrtoint ptr addrspace(3) @lds to i32 + store i32 %addr, ptr addrspace(1) %out + ret void +} + +define void @global_value_as8_buffer_store(i32 %val) { +; GCN-LABEL: global_value_as8_buffer_store: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[8:9] +; GCN-NEXT: s_add_u32 s8, s8, buf@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s9, s9, buf@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] + call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %val, ptr addrspace(8) @buf, i32 0, i32 0, i32 0) + ret void +} + +define i32 @global_value_as8_buffer_load(i32 %offset) { +; GCN-LABEL: global_value_as8_buffer_load: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[8:9] +; GCN-NEXT: s_add_u32 s8, s8, buf@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s9, s9, buf@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] + %val = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) @buf, i32 %offset, i32 0, i32 0) + ret i32 %val +} + +declare void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32, ptr addrspace(8) nocapture writeonly, i32, i32, i32 immarg) #0 +declare i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) nocapture readonly, i32, i32, i32 immarg) #1 + +attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: 
write) } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll index e5cd0710359ac..70ff92f8eda92 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll @@ -66,7 +66,7 @@ define amdgpu_kernel void @asm_simple_agpr_clobber() { define i32 @asm_vgpr_early_clobber() { ; CHECK-LABEL: name: asm_vgpr_early_clobber ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %8, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %9, !1 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 1245195 /* regdef-ec:VGPR_32 */, def early-clobber %8, 1245195 /* regdef-ec:VGPR_32 */, def early-clobber %9, !1 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %9 ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]] @@ -94,7 +94,7 @@ entry: define i32 @test_single_vgpr_output() nounwind { ; CHECK-LABEL: name: test_single_vgpr_output ; CHECK: bb.1.entry: - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %8 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %8 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -106,7 +106,7 @@ entry: define i32 @test_single_sgpr_output_s32() nounwind { ; CHECK-LABEL: name: test_single_sgpr_output_s32 ; CHECK: bb.1.entry: - ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2424842 /* regdef:SReg_32 */, def %8 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1835018 /* regdef:SReg_32 */, def %8 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -119,7 +119,7 @@ entry: define float @test_multiple_register_outputs_same() #0 { ; CHECK-LABEL: name: test_multiple_register_outputs_same ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_mov_b32 $1, 1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %8, 1835018 /* regdef:VGPR_32 */, def %9 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_mov_b32 $1, 1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %8, 1245194 /* regdef:VGPR_32 */, def %9 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %9 ; CHECK-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY]], [[COPY1]] @@ -136,7 +136,7 @@ define float @test_multiple_register_outputs_same() #0 { define double @test_multiple_register_outputs_mixed() #0 { ; CHECK-LABEL: name: test_multiple_register_outputs_mixed ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %8, 3407882 /* regdef:VReg_64 */, def %9 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %8, 2818058 /* regdef:VReg_64 */, def %9 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY %9 ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) @@ -171,7 +171,7 @@ define 
amdgpu_kernel void @test_input_vgpr_imm() { ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[C]](s32) - ; CHECK-NEXT: INLINEASM &"v_mov_b32 v0, $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"v_mov_b32 v0, $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY1]] ; CHECK-NEXT: S_ENDPGM 0 call void asm sideeffect "v_mov_b32 v0, $0", "v"(i32 42) ret void @@ -185,7 +185,7 @@ define amdgpu_kernel void @test_input_sgpr_imm() { ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[C]](s32) - ; CHECK-NEXT: INLINEASM &"s_mov_b32 s0, $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"s_mov_b32 s0, $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[COPY1]] ; CHECK-NEXT: S_ENDPGM 0 call void asm sideeffect "s_mov_b32 s0, $0", "s"(i32 42) ret void @@ -212,7 +212,7 @@ define float @test_input_vgpr(i32 %src) nounwind { ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32) - ; CHECK-NEXT: INLINEASM &"v_add_f32 $0, 1.0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %9, 1835017 /* reguse:VGPR_32 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"v_add_f32 $0, 1.0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %9, 1245193 /* reguse:VGPR_32 */, [[COPY1]] ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %9 ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -227,7 +227,7 @@ define i32 @test_memory_constraint(ptr addrspace(3) %a) nounwind { ; CHECK-NEXT: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; CHECK-NEXT: INLINEASM &"ds_read_b32 $0, $1", 8 /* mayload attdialect */, 1835018 /* regdef:VGPR_32 */, def %9, 262158 /* mem:m */, [[COPY]](p3) + ; CHECK-NEXT: INLINEASM &"ds_read_b32 $0, $1", 8 /* mayload attdialect */, 1245194 /* regdef:VGPR_32 */, def %9, 262158 /* mem:m */, [[COPY]](p3) ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %9 ; CHECK-NEXT: $vgpr0 = COPY [[COPY1]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -244,7 +244,7 @@ define i32 @test_vgpr_matching_constraint(i32 %a) nounwind { ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]] ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[AND]](s32) - ; CHECK-NEXT: INLINEASM &";", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %11, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3) + ; CHECK-NEXT: INLINEASM &";", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %11, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %11 ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -256,13 +256,13 @@ define i32 @test_vgpr_matching_constraint(i32 %a) nounwind { define i32 @test_sgpr_matching_constraint() nounwind { ; CHECK-LABEL: name: test_sgpr_matching_constraint ; CHECK: bb.1.entry: - ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2424842 /* regdef:SReg_32 */, def %8 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1835018 /* regdef:SReg_32 */, def %8 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 - ; CHECK-NEXT: 
INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 2424842 /* regdef:SReg_32 */, def %10 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 1835018 /* regdef:SReg_32 */, def %10 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %10 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]](s32) ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY1]](s32) - ; CHECK-NEXT: INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 2424842 /* regdef:SReg_32 */, def %12, 2424841 /* reguse:SReg_32 */, [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3) + ; CHECK-NEXT: INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 1835018 /* regdef:SReg_32 */, def %12, 1835017 /* reguse:SReg_32 */, [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY %12 ; CHECK-NEXT: $vgpr0 = COPY [[COPY4]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -285,7 +285,7 @@ define void @test_many_matching_constraints(i32 %a, i32 %b, i32 %c) nounwind { ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32) ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY1]](s32) - ; CHECK-NEXT: INLINEASM &"; ", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %11, 1835018 /* regdef:VGPR_32 */, def %12, 1835018 /* regdef:VGPR_32 */, def %13, 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3), 2147614729 /* reguse tiedto:$2 */, [[COPY4]](tied-def 7), 2147549193 /* reguse tiedto:$1 */, [[COPY5]](tied-def 5) + ; CHECK-NEXT: INLINEASM &"; ", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %11, 1245194 /* regdef:VGPR_32 */, def %12, 1245194 /* regdef:VGPR_32 */, def %13, 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3), 2147614729 /* reguse tiedto:$2 */, [[COPY4]](tied-def 7), 2147549193 /* reguse tiedto:$1 */, [[COPY5]](tied-def 5) ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY %11 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY %12 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY %13 @@ -306,10 +306,10 @@ define void @test_many_matching_constraints(i32 %a, i32 %b, i32 %c) nounwind { define i32 @test_sgpr_to_vgpr_move_matching_constraint() nounwind { ; CHECK-LABEL: name: test_sgpr_to_vgpr_move_matching_constraint ; CHECK: bb.1.entry: - ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2424842 /* regdef:SReg_32 */, def %8 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1835018 /* regdef:SReg_32 */, def %8 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32) - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %10, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3) + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %10, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %10 ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll index 82886ab9e7d55..e1ac8ba5e6db4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll @@ -1,4 +1,4 @@ -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -amdgpu-enable-lower-module-lds=0 -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s +; RUN: llc -global-isel 
-new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -amdgpu-enable-lower-module-lds=0 -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s ; FIXME: Merge with DAG test @lds.external = external unnamed_addr addrspace(3) global [0 x i32] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll index cabb37c330b4a..3396eaedf359e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll @@ -1,8 +1,8 @@ -; RUN: llc -mtriple=amdgcn -mcpu=tahiti -global-isel -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti -global-isel -new-reg-bank-select -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel -new-reg-bank-select -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: not llc -mtriple=amdgcn -mcpu=tahiti -global-isel < %s 2>&1 | FileCheck %s -; RUN: not llc -mtriple=amdgcn -mcpu=tonga -global-isel < %s 2>&1 | FileCheck %s +; RUN: not llc -mtriple=amdgcn -mcpu=tahiti -global-isel -new-reg-bank-select < %s 2>&1 | FileCheck %s +; RUN: not llc -mtriple=amdgcn -mcpu=tonga -global-isel -new-reg-bank-select < %s 2>&1 | FileCheck %s ; CHECK: error: lds: unsupported initializer for address space diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll index 74153b904bf3a..7d141a0e8720e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll @@ -8,17 +8,16 @@ define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspac ; GFX10-LABEL: v_mul_i64_no_zext: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[0:1] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, v0, v2, 0 -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, v0, v3, v[5:6] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v2, v[5:6] -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: global_store_dwordx2 v7, v[4:5], s[2:3] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v2, v4, 0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, v2, v5, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, v3, v4, v[1:2] +; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_no_zext: @@ -26,19 +25,17 @@ define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspac ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[0:1], v9, s[0:1] -; GFX11-NEXT: global_load_b64 v[2:3], v9, s[2:3] +; GFX11-NEXT: global_load_b64 v[2:3], v8, s[0:1] +; 
GFX11-NEXT: global_load_b64 v[4:5], v8, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v0, v2, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v4, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v0, v3, v[5:6] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v1, v2, v[6:7] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v5, v7 -; GFX11-NEXT: global_store_b64 v9, v[4:5], s[2:3] +; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v2, v5, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v3, v4, v[6:7] +; GFX11-NEXT: global_store_b64 v8, v[0:1], s[2:3] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid @@ -58,18 +55,16 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] -; GFX10-NEXT: global_load_dword v4, v3, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v1, s[2:3] +; GFX10-NEXT: global_load_dword v4, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v0, v4, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v1, v4, v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v2, v4, 0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v3, v4, v[1:2] +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_zext_src1: @@ -80,17 +75,17 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v1, s[2:3] -; GFX11-NEXT: global_load_b32 v5, v2, s[4:5] +; GFX11-NEXT: global_load_b64 v[2:3], v1, s[2:3] +; GFX11-NEXT: global_load_b32 v5, v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, v5, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v5, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v1, v5, v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1] +; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v3, v5, v[4:5] +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid @@ -110,18 +105,16 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-NEXT: v_lshlrev_b32_e32 
v2, 2, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v2, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[6:7] +; GFX10-NEXT: global_load_dword v4, v1, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v4, v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v1, v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v2, 0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v4, v3, v[1:2] +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_zext_src0: @@ -135,14 +128,14 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v5, v1, s[2:3] -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5] +; GFX11-NEXT: global_load_b64 v[2:3], v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1] +; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5] +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i32, ptr addrspace(1) %aptr, i32 %tid @@ -209,18 +202,16 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v4, v2, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] +; GFX10-NEXT: global_load_dword v4, v0, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v4, v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v1, v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v2, 0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v4, v3, v[1:2] +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_masked_src0_hi: @@ -234,14 +225,14 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v5, v0, s[2:3] -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5] +; GFX11-NEXT: global_load_b64 v[2:3], v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0 +; GFX11-NEXT: 
v_mad_u64_u32 v[0:1], null, v5, v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1] +; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5] +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid @@ -389,22 +380,20 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[1:2], v0, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v6, 0xfff00000, v0 +; GFX10-NEXT: v_and_b32_e32 v5, 0xfff00000, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[4:5], s2, v6, v2, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s2, v6, v3, v[0:1] -; GFX10-NEXT: v_and_b32_e32 v0, 0xf00f, v1 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v0, v2, v[5:6] -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: global_store_dwordx2 v0, v[4:5], s[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v5, v3, 0 +; GFX10-NEXT: v_mad_u64_u32 v[4:5], s2, v5, v4, v[1:2] +; GFX10-NEXT: v_and_b32_e32 v1, 0xf00f, v2 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v1, v3, v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_partially_masked_src0: @@ -414,24 +403,22 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] -; GFX11-NEXT: global_load_b64 v[2:3], v2, s[4:5] +; GFX11-NEXT: global_load_b64 v[1:2], v0, s[2:3] +; GFX11-NEXT: global_load_b64 v[3:4], v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xfff00000, v0 +; GFX11-NEXT: v_and_b32_e32 v7, 0xfff00000, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v7, v2, 0 -; GFX11-NEXT: v_mov_b32_e32 v0, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v7, v3, v[0:1] -; GFX11-NEXT: v_and_b32_e32 v3, 0xf00f, v1 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[5:6] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v7, v3, 0 +; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v7, v4, v[1:2] +; GFX11-NEXT: v_and_b32_e32 v4, 0xf00f, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v0, 0 -; 
GFX11-NEXT: global_store_b64 v0, v[4:5], s[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v4, v3, v[5:6] +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid @@ -536,28 +523,28 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[2:3], v0, s[2:3] -; GFX11-NEXT: global_load_b64 v[4:5], v0, s[4:5] +; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3] +; GFX11-NEXT: global_load_b64 v[5:6], v0, s[4:5] ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmpx_ge_u64_e32 0, v[2:3] +; GFX11-NEXT: v_cmpx_ge_u64_e32 0, v[3:4] ; GFX11-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %else ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v4, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v5, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v2, v5, v[1:2] -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX11-NEXT: v_mov_b32_e32 v1, v3 -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v3, v6, v[4:5] +; GFX11-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX11-NEXT: .LBB10_2: ; %Flow ; GFX11-NEXT: s_and_not1_saveexec_b32 s2, s2 ; GFX11-NEXT: s_cbranch_execz .LBB10_4 ; GFX11-NEXT: ; %bb.3: ; %if ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mul_lo_u32 v1, v2, v5 +; GFX11-NEXT: v_mul_lo_u32 v1, v3, v6 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: .LBB10_4: ; %endif ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 7f10ee4c17450..3eecaccf0308f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -741,12 +741,13 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GCN-NEXT: v_mov_b32_e32 v6, v0 ; GCN-NEXT: v_mov_b32_e32 v7, v1 ; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0 -; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v4, v[0:1] -; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v3, 0 -; GCN-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v3, v[8:9] -; GCN-NEXT: v_mov_b32_e32 v2, v10 -; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v4, v[1:2] -; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[8:9] +; GCN-NEXT: v_mov_b32_e32 v8, v2 +; GCN-NEXT: v_mov_b32_e32 v9, v3 +; GCN-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v7, v4, v[0:1] +; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, 0 +; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v9, v[10:11] +; GCN-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v4, v[1:2] +; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v9, v[10:11] ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i96: @@ -754,26 +755,26 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 ; GFX10-NEXT: v_mov_b32_e32 v7, v1 +; GFX10-NEXT: v_mov_b32_e32 v8, v3 ; GFX10-NEXT: v_mul_lo_u32 v0, v6, v5 -; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v7, v4, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v6, v3, 0 -; GFX10-NEXT: 
v_mad_u64_u32 v[8:9], s4, v2, v3, v[8:9] -; GFX10-NEXT: v_mov_b32_e32 v2, v8 -; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v6, v4, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v3, v[8:9] +; GFX10-NEXT: v_mad_u64_u32 v[9:10], s4, v7, v4, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v6, v8, 0 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v2, v8, v[9:10] +; GFX10-NEXT: v_mad_u64_u32 v[9:10], s4, v6, v4, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v8, v[9:10] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_mul_i96: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1 +; GFX11-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_mov_b32 v9, v3 ; GFX11-NEXT: v_mul_lo_u32 v0, v6, v5 -; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v7, v4, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v3, 0 -; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v2, v3, v[8:9] -; GFX11-NEXT: v_mov_b32_e32 v2, v9 -; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v6, v4, v[1:2] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v3, v[8:9] +; GFX11-NEXT: v_mad_u64_u32 v[10:11], null, v7, v4, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v9, 0 +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v8, v9, v[10:11] +; GFX11-NEXT: v_mad_u64_u32 v[10:11], null, v6, v4, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v9, v[10:11] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_mul_i96: @@ -784,16 +785,16 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mul_lo_u32 v0, v6, v5 -; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v7, v4, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v6, v3, 0 +; GFX12-NEXT: v_mov_b32_e32 v8, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v2, v3, v[8:9] -; GFX12-NEXT: v_mov_b32_e32 v2, v8 +; GFX12-NEXT: v_mul_lo_u32 v0, v6, v5 +; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], null, v7, v4, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v6, v8, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v2, v8, v[9:10] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v6, v4, v[1:2] -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v3, v[8:9] +; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], null, v6, v4, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v8, v[9:10] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: v_mul_i96: @@ -1072,18 +1073,17 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX7-NEXT: v_mov_b32_e32 v9, v1 ; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 ; GFX7-NEXT: v_mov_b32_e32 v10, v2 -; GFX7-NEXT: v_mov_b32_e32 v11, v3 -; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v5, v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v12, v4 +; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v9, v5, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v11, v3 ; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0 -; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v12, v[2:3] -; GFX7-NEXT: v_mul_lo_u32 v7, v8, v7 -; GFX7-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX7-NEXT: v_mov_b32_e32 v2, v13 -; GFX7-NEXT: v_mad_u64_u32 v[3:4], vcc, v8, v5, v[1:2] -; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, 
v[3:4] -; GFX7-NEXT: v_addc_u32_e64 v3, s[4:5], v14, v7, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc +; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v12, v[13:14] +; GFX7-NEXT: v_mul_lo_u32 v4, v9, v6 +; GFX7-NEXT: v_mul_lo_u32 v6, v8, v7 +; GFX7-NEXT: v_mad_u64_u32 v[13:14], vcc, v8, v5, v[1:2] +; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[13:14] +; GFX7-NEXT: v_addc_u32_e64 v3, s[4:5], v3, v6, s[4:5] +; GFX7-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; GFX7-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4] ; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7] ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -1095,18 +1095,17 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 ; GFX8-NEXT: v_mov_b32_e32 v10, v2 -; GFX8-NEXT: v_mov_b32_e32 v11, v3 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v5, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v12, v4 +; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v9, v5, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v11, v3 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0 -; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v12, v[2:3] -; GFX8-NEXT: v_mul_lo_u32 v7, v8, v7 -; GFX8-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX8-NEXT: v_mov_b32_e32 v2, v13 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], vcc, v8, v5, v[1:2] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[3:4] -; GFX8-NEXT: v_addc_u32_e64 v3, s[4:5], v14, v7, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v12, v[13:14] +; GFX8-NEXT: v_mul_lo_u32 v4, v9, v6 +; GFX8-NEXT: v_mul_lo_u32 v6, v8, v7 +; GFX8-NEXT: v_mad_u64_u32 v[13:14], vcc, v8, v5, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[13:14] +; GFX8-NEXT: v_addc_u32_e64 v3, s[4:5], v3, v6, s[4:5] +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4] ; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1118,18 +1117,17 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX9-NEXT: v_mov_b32_e32 v9, v1 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v10, v2 -; GFX9-NEXT: v_mov_b32_e32 v11, v3 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v5, v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v12, v4 +; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v9, v5, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v11, v3 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0 -; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v12, v[2:3] -; GFX9-NEXT: v_mul_lo_u32 v7, v8, v7 -; GFX9-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX9-NEXT: v_mov_b32_e32 v2, v13 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], vcc, v8, v5, v[1:2] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[3:4] -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], v14, v7, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v12, v[13:14] +; GFX9-NEXT: v_mul_lo_u32 v4, v9, v6 +; GFX9-NEXT: v_mul_lo_u32 v6, v8, v7 +; GFX9-NEXT: v_mad_u64_u32 v[13:14], vcc, v8, v5, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[13:14] +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], v3, v6, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4] ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1140,19 +1138,19 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX10-NEXT: v_mov_b32_e32 v8, v0 ; GFX10-NEXT: 
v_mov_b32_e32 v9, v1 ; GFX10-NEXT: v_mov_b32_e32 v10, v2 +; GFX10-NEXT: v_mov_b32_e32 v11, v3 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v6, 0 ; GFX10-NEXT: v_mul_lo_u32 v7, v8, v7 ; GFX10-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v9, v5, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[12:13], s4, v9, v5, v[0:1] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v4, 0 -; GFX10-NEXT: v_mad_u64_u32 v[13:14], s4, v10, v4, v[11:12] -; GFX10-NEXT: v_mov_b32_e32 v2, v13 -; GFX10-NEXT: v_mad_u64_u32 v[11:12], vcc_lo, v8, v5, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v9, v4, v[11:12] -; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v14, v7, s4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v6, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s4, v10, v5, v[6:7] -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v3, v4, v[5:6] +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v10, v4, v[12:13] +; GFX10-NEXT: v_mad_u64_u32 v[12:13], vcc_lo, v8, v5, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v9, v4, v[12:13] +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s4, v3, v7, s4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[5:6], s4, v10, v5, v[3:4] +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v11, v4, v[5:6] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_mul_i128: @@ -1162,16 +1160,15 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX11-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v4 ; GFX11-NEXT: v_mov_b32_e32 v12, v3 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v6, 0 -; GFX11-NEXT: v_mul_lo_u32 v7, v8, v7 -; GFX11-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v9, v5, v[0:1] +; GFX11-NEXT: v_mul_lo_u32 v4, v9, v6 +; GFX11-NEXT: v_mul_lo_u32 v6, v8, v7 +; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v9, v5, v[0:1] ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v11, 0 -; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v10, v11, v[2:3] -; GFX11-NEXT: v_mov_b32_e32 v2, v13 -; GFX11-NEXT: v_mad_u64_u32 v[3:4], vcc_lo, v8, v5, v[1:2] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v9, v11, v[3:4] -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v14, v7, s0 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v6, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v10, v11, v[13:14] +; GFX11-NEXT: v_mad_u64_u32 v[13:14], vcc_lo, v8, v5, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v9, v11, v[13:14] +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v6, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v4, vcc_lo ; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v10, v5, v[3:4] ; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v12, v11, v[6:7] ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1184,28 +1181,26 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1 -; GFX12-NEXT: v_mov_b32_e32 v10, v2 +; GFX12-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v6, 0 ; GFX12-NEXT: v_mul_lo_u32 v7, v8, v7 ; GFX12-NEXT: v_mul_lo_u32 v6, v9, v6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v9, v5, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], null, v9, v5, v[0:1] ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v4, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[13:14], null, v10, v4, v[11:12] +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v10, v4, v[12:13] 
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v2, v13 -; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], vcc_lo, v8, v5, v[1:2] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[11:12] +; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], vcc_lo, v8, v5, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[12:13] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v14, v7, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, s0 ; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v6, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, v7, v6, vcc_lo -; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v10, v5, v[6:7] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v3, v4, v[5:6] +; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v10, v5, v[3:4] +; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v11, v4, v[5:6] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: v_mul_i128: @@ -2409,216 +2404,204 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX7-NEXT: v_mul_lo_u32 v28, v3, v12 -; GFX7-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX7-NEXT: v_mul_lo_u32 v29, v3, v12 +; GFX7-NEXT: v_mul_lo_u32 v30, v2, v13 ; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[20:21] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0 -; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v4, v10, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v1, v9, v[20:21] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v5, v9, v[22:23] -; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v10, v[18:19] -; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[22:23] -; GFX7-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v4, v8, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v6, v8, v[20:21] -; GFX7-NEXT: v_addc_u32_e32 v26, vcc, 0, v24, vcc -; GFX7-NEXT: v_mov_b32_e32 v21, v22 -; GFX7-NEXT: v_mov_b32_e32 v22, v23 -; GFX7-NEXT: v_mov_b32_e32 v23, v18 -; GFX7-NEXT: v_mad_u64_u32 v[24:25], vcc, v0, v13, v[22:23] -; GFX7-NEXT: v_mul_lo_u32 v18, v6, v9 -; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[16:17] -; GFX7-NEXT: v_addc_u32_e64 v6, s[4:5], 0, v6, s[4:5] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[24:25] -; GFX7-NEXT: v_mov_b32_e32 v20, v23 -; GFX7-NEXT: v_mul_lo_u32 v25, v4, v11 -; GFX7-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v2, v11, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v0, v11, v[20:21] -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[23:24] -; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[10:11], v1, v10, v[16:17] -; GFX7-NEXT: v_addc_u32_e64 v24, s[10:11], 0, v23, s[10:11] -; 
GFX7-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[20:21] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[12:13], v4, v9, v[11:12] -; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX7-NEXT: v_mov_b32_e32 v12, v22 -; GFX7-NEXT: v_mul_lo_u32 v2, v2, v13 -; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[16:17], v0, v9, v[11:12] -; GFX7-NEXT: v_addc_u32_e64 v13, s[10:11], 0, v24, s[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[14:15], v5, v8, v[20:21] -; GFX7-NEXT: v_mad_u64_u32 v[4:5], s[10:11], v3, v8, v[16:17] -; GFX7-NEXT: v_addc_u32_e64 v16, s[10:11], 0, v13, s[10:11] -; GFX7-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX7-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[10:11], v1, v8, v[22:23] -; GFX7-NEXT: v_addc_u32_e64 v3, s[10:11], v3, v4, s[10:11] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v10, 0 +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v3, v11, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v9, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v4, v10, v[20:21] +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v12, 0 +; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v5, v9, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[24:25], s[6:7], v1, v11, v[20:21] +; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v8, v[16:17] +; GFX7-NEXT: v_addc_u32_e32 v28, vcc, 0, v20, vcc +; GFX7-NEXT: v_mad_u64_u32 v[16:17], vcc, v2, v10, v[24:25] +; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] +; GFX7-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; GFX7-NEXT: v_mad_u64_u32 v[26:27], s[8:9], v6, v8, v[22:23] +; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v3, v9, v[16:17] +; GFX7-NEXT: v_addc_u32_e32 v16, vcc, 0, v20, vcc +; GFX7-NEXT: v_mad_u64_u32 v[20:21], vcc, v4, v8, v[22:23] +; GFX7-NEXT: v_mov_b32_e32 v22, v26 +; GFX7-NEXT: v_addc_u32_e32 v23, vcc, 0, v16, vcc +; GFX7-NEXT: v_mad_u64_u32 v[16:17], vcc, v0, v13, v[21:22] +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v0, v11, v[19:20] +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v1, v12, v[16:17] +; GFX7-NEXT: v_mul_lo_u32 v26, v4, v11 +; GFX7-NEXT: v_mul_lo_u32 v25, v5, v10 +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v2, v11, v[19:20] +; GFX7-NEXT: v_mul_lo_u32 v24, v6, v9 +; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v4, v9, v[11:12] +; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] +; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v10, v[21:22] +; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13] +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[12:13], v2, v9, v[11:12] +; GFX7-NEXT: v_mul_lo_u32 v10, v1, v14 +; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[14:15], v5, v8, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[16:17], v0, v8, 0 +; GFX7-NEXT: v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13] +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v3, v8, v[19:20] +; GFX7-NEXT: v_addc_u32_e64 v5, s[12:13], 0, v2, s[12:13] +; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[12:13], v0, v9, v[17:18] +; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] ; GFX7-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX7-NEXT: v_addc_u32_e64 v4, s[10:11], v6, v5, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v5, s[10:11], v16, v11, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v6, s[10:11], v26, v12, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v19, v0, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v9, s[14:15] -; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v2, s[12:13] -; GFX7-NEXT: 
v_addc_u32_e64 v0, s[8:9], v0, v28, s[8:9] -; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v25, s[6:7] -; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v18, vcc -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v7, v8, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, v10 -; GFX7-NEXT: v_mov_b32_e32 v1, v13 -; GFX7-NEXT: v_mov_b32_e32 v2, v14 -; GFX7-NEXT: v_mov_b32_e32 v7, v11 +; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v8, v[2:3] +; GFX7-NEXT: v_addc_u32_e64 v3, s[12:13], v4, v21, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], v28, v22, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v5, s[12:13], v5, v13, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v6, s[12:13], v23, v14, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v0, s[12:13], v27, v0, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v0, s[12:13], v0, v10, s[14:15] +; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v30, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v29, s[8:9] +; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v26, s[6:7] +; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v25, s[4:5] +; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v24, vcc +; GFX7-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v7, v8, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, v16 +; GFX7-NEXT: v_mov_b32_e32 v1, v11 +; GFX7-NEXT: v_mov_b32_e32 v2, v12 +; GFX7-NEXT: v_mov_b32_e32 v7, v9 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_i256: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX8-NEXT: v_mul_lo_u32 v28, v3, v12 -; GFX8-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX8-NEXT: v_mul_lo_u32 v29, v3, v12 +; GFX8-NEXT: v_mul_lo_u32 v30, v2, v13 ; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[20:21] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0 -; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v4, v10, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v1, v9, v[20:21] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v5, v9, v[22:23] -; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v10, v[18:19] -; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[22:23] -; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v4, v8, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v6, v8, v[20:21] -; GFX8-NEXT: v_addc_u32_e32 v26, vcc, 0, v24, vcc -; GFX8-NEXT: v_mov_b32_e32 v21, v22 -; GFX8-NEXT: v_mov_b32_e32 v22, v23 -; GFX8-NEXT: v_mov_b32_e32 v23, v18 -; GFX8-NEXT: v_mad_u64_u32 v[24:25], vcc, v0, v13, v[22:23] -; GFX8-NEXT: v_mul_lo_u32 v18, v6, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[16:17] -; GFX8-NEXT: v_addc_u32_e64 v6, s[4:5], 0, v6, s[4:5] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[24:25] -; GFX8-NEXT: v_mov_b32_e32 v20, v23 -; GFX8-NEXT: v_mul_lo_u32 v25, v4, v11 -; GFX8-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v2, v11, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v0, v11, v[20:21] -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[23:24] -; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[10:11], v1, v10, v[16:17] -; GFX8-NEXT: v_addc_u32_e64 v24, 
s[10:11], 0, v23, s[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[20:21] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[12:13], v4, v9, v[11:12] -; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX8-NEXT: v_mov_b32_e32 v12, v22 -; GFX8-NEXT: v_mul_lo_u32 v2, v2, v13 -; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[16:17], v0, v9, v[11:12] -; GFX8-NEXT: v_addc_u32_e64 v13, s[10:11], 0, v24, s[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[14:15], v5, v8, v[20:21] -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[10:11], v3, v8, v[16:17] -; GFX8-NEXT: v_addc_u32_e64 v16, s[10:11], 0, v13, s[10:11] -; GFX8-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[10:11], v1, v8, v[22:23] -; GFX8-NEXT: v_addc_u32_e64 v3, s[10:11], v3, v4, s[10:11] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v10, 0 +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v3, v11, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v9, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v4, v10, v[20:21] +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v12, 0 +; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v5, v9, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[24:25], s[6:7], v1, v11, v[20:21] +; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v8, v[16:17] +; GFX8-NEXT: v_addc_u32_e32 v28, vcc, 0, v20, vcc +; GFX8-NEXT: v_mad_u64_u32 v[16:17], vcc, v2, v10, v[24:25] +; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] +; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; GFX8-NEXT: v_mad_u64_u32 v[26:27], s[8:9], v6, v8, v[22:23] +; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v3, v9, v[16:17] +; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v20, vcc +; GFX8-NEXT: v_mad_u64_u32 v[20:21], vcc, v4, v8, v[22:23] +; GFX8-NEXT: v_mov_b32_e32 v22, v26 +; GFX8-NEXT: v_addc_u32_e32 v23, vcc, 0, v16, vcc +; GFX8-NEXT: v_mad_u64_u32 v[16:17], vcc, v0, v13, v[21:22] +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v0, v11, v[19:20] +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v1, v12, v[16:17] +; GFX8-NEXT: v_mul_lo_u32 v26, v4, v11 +; GFX8-NEXT: v_mul_lo_u32 v25, v5, v10 +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v2, v11, v[19:20] +; GFX8-NEXT: v_mul_lo_u32 v24, v6, v9 +; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v4, v9, v[11:12] +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] +; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v10, v[21:22] +; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13] +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[12:13], v2, v9, v[11:12] +; GFX8-NEXT: v_mul_lo_u32 v10, v1, v14 +; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[14:15], v5, v8, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[16:17], v0, v8, 0 +; GFX8-NEXT: v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13] +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v3, v8, v[19:20] +; GFX8-NEXT: v_addc_u32_e64 v5, s[12:13], 0, v2, s[12:13] +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[12:13], v0, v9, v[17:18] +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] ; GFX8-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX8-NEXT: v_addc_u32_e64 v4, s[10:11], v6, v5, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v5, s[10:11], v16, v11, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v6, s[10:11], v26, v12, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v19, v0, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v9, s[14:15] -; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], 
v0, v2, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v28, s[8:9] -; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v25, s[6:7] -; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v18, vcc -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v7, v8, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, v10 -; GFX8-NEXT: v_mov_b32_e32 v1, v13 -; GFX8-NEXT: v_mov_b32_e32 v2, v14 -; GFX8-NEXT: v_mov_b32_e32 v7, v11 +; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v8, v[2:3] +; GFX8-NEXT: v_addc_u32_e64 v3, s[12:13], v4, v21, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], v28, v22, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v5, s[12:13], v5, v13, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v6, s[12:13], v23, v14, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v0, s[12:13], v27, v0, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v0, s[12:13], v0, v10, s[14:15] +; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v30, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v29, s[8:9] +; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v26, s[6:7] +; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v25, s[4:5] +; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v24, vcc +; GFX8-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v7, v8, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, v16 +; GFX8-NEXT: v_mov_b32_e32 v1, v11 +; GFX8-NEXT: v_mov_b32_e32 v2, v12 +; GFX8-NEXT: v_mov_b32_e32 v7, v9 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i256: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX9-NEXT: v_mul_lo_u32 v28, v3, v12 -; GFX9-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX9-NEXT: v_mul_lo_u32 v29, v3, v12 +; GFX9-NEXT: v_mul_lo_u32 v30, v2, v13 ; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[20:21] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0 -; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v4, v10, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v1, v9, v[20:21] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v5, v9, v[22:23] -; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v10, v[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v24, vcc -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[22:23] -; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v24, vcc -; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v4, v8, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v6, v8, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e32 v26, vcc, 0, v24, vcc -; GFX9-NEXT: v_mov_b32_e32 v21, v22 -; GFX9-NEXT: v_mov_b32_e32 v22, v23 -; GFX9-NEXT: v_mov_b32_e32 v23, v18 -; GFX9-NEXT: v_mad_u64_u32 v[24:25], vcc, v0, v13, v[22:23] -; GFX9-NEXT: v_mul_lo_u32 v18, v6, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[16:17] -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, v6, s[4:5] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[24:25] -; GFX9-NEXT: v_mov_b32_e32 v20, v23 -; GFX9-NEXT: v_mul_lo_u32 v25, v4, v11 -; GFX9-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v2, v11, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v0, v11, v[20:21] -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[23:24] -; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[10:11], v1, v10, 
v[16:17] -; GFX9-NEXT: v_addc_co_u32_e64 v24, s[10:11], 0, v23, s[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[20:21] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[12:13], v4, v9, v[11:12] -; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX9-NEXT: v_mov_b32_e32 v12, v22 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, v13 -; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[16:17], v0, v9, v[11:12] -; GFX9-NEXT: v_addc_co_u32_e64 v13, s[10:11], 0, v24, s[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[14:15], v5, v8, v[20:21] -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[10:11], v3, v8, v[16:17] -; GFX9-NEXT: v_addc_co_u32_e64 v16, s[10:11], 0, v13, s[10:11] -; GFX9-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[10:11], v1, v8, v[22:23] -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v4, s[10:11] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v10, 0 +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v3, v11, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v9, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v4, v10, v[20:21] +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v12, 0 +; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v5, v9, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[24:25], s[6:7], v1, v11, v[20:21] +; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v8, v[16:17] +; GFX9-NEXT: v_addc_co_u32_e32 v28, vcc, 0, v20, vcc +; GFX9-NEXT: v_mad_u64_u32 v[16:17], vcc, v2, v10, v[24:25] +; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] +; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v20, vcc +; GFX9-NEXT: v_mad_u64_u32 v[26:27], s[8:9], v6, v8, v[22:23] +; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v3, v9, v[16:17] +; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v20, vcc +; GFX9-NEXT: v_mad_u64_u32 v[20:21], vcc, v4, v8, v[22:23] +; GFX9-NEXT: v_mov_b32_e32 v22, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v16, vcc +; GFX9-NEXT: v_mad_u64_u32 v[16:17], vcc, v0, v13, v[21:22] +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v0, v11, v[19:20] +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v1, v12, v[16:17] +; GFX9-NEXT: v_mul_lo_u32 v26, v4, v11 +; GFX9-NEXT: v_mul_lo_u32 v25, v5, v10 +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v2, v11, v[19:20] +; GFX9-NEXT: v_mul_lo_u32 v24, v6, v9 +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v4, v9, v[11:12] +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v10, v[21:22] +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], 0, v4, s[12:13] +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[12:13], v2, v9, v[11:12] +; GFX9-NEXT: v_mul_lo_u32 v10, v1, v14 +; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[14:15], v5, v8, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[16:17], v0, v8, 0 +; GFX9-NEXT: v_addc_co_u32_e64 v2, s[12:13], 0, v4, s[12:13] +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v3, v8, v[19:20] +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[12:13], 0, v2, s[12:13] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[12:13], v0, v9, v[17:18] +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] ; GFX9-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX9-NEXT: v_addc_co_u32_e64 v4, s[10:11], v6, v5, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v5, s[10:11], v16, v11, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[10:11], v26, v12, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v19, v0, s[10:11] -; GFX9-NEXT: 
v_addc_co_u32_e64 v0, s[10:11], v0, v9, s[14:15] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v2, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v28, s[8:9] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v25, s[6:7] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v27, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v18, vcc -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v7, v8, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, v10 -; GFX9-NEXT: v_mov_b32_e32 v1, v13 -; GFX9-NEXT: v_mov_b32_e32 v2, v14 -; GFX9-NEXT: v_mov_b32_e32 v7, v11 +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v8, v[2:3] +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[12:13], v4, v21, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], v28, v22, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[12:13], v5, v13, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[12:13], v23, v14, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[12:13], v27, v0, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[12:13], v0, v10, s[14:15] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v30, s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v29, s[8:9] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v26, s[6:7] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v25, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v24, vcc +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v7, v8, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, v16 +; GFX9-NEXT: v_mov_b32_e32 v1, v11 +; GFX9-NEXT: v_mov_b32_e32 v2, v12 +; GFX9-NEXT: v_mov_b32_e32 v7, v9 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i256: @@ -2626,69 +2609,67 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v16, v0 ; GFX10-NEXT: v_mov_b32_e32 v17, v1 -; GFX10-NEXT: v_mul_lo_u32 v29, v4, v11 -; GFX10-NEXT: v_mul_lo_u32 v31, v3, v12 -; GFX10-NEXT: v_mul_lo_u32 v30, v2, v13 +; GFX10-NEXT: v_mov_b32_e32 v18, v2 +; GFX10-NEXT: v_mov_b32_e32 v19, v3 +; GFX10-NEXT: v_mul_lo_u32 v27, v6, v9 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v14, 0 -; GFX10-NEXT: v_mul_lo_u32 v28, v17, v14 -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v13, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v12, 0 -; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v2, v12, v[18:19] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v11, v[0:1] +; GFX10-NEXT: v_mul_lo_u32 v30, v4, v11 +; GFX10-NEXT: v_mul_lo_u32 v28, v5, v10 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v17, v13, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v18, v12, v[2:3] +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v16, v12, 0 +; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v19, v11, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v11, v[2:3] ; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, 1, s4 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v3, v11, v[20:21] -; GFX10-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19] +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s5, v4, v10, v[20:21] +; GFX10-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v18, v10, v[0:1] ; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v22, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v4, v10, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21] -; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v22, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v5, v9, v[18:19] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v16, v10, 0 -; GFX10-NEXT: v_mad_u64_u32 v[22:23], vcc_lo, v4, v8, v[0:1] -; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v26, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[24:25], s4, v6, v8, v[20:21] -; GFX10-NEXT: 
v_mad_u64_u32 v[0:1], s4, v17, v9, v[18:19] -; GFX10-NEXT: v_mov_b32_e32 v18, v23 -; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4 -; GFX10-NEXT: v_mul_lo_u32 v23, v6, v9 -; GFX10-NEXT: v_mov_b32_e32 v19, v24 -; GFX10-NEXT: v_mul_lo_u32 v24, v5, v10 -; GFX10-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v8, v[0:1] -; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v27, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v13, v[18:19] -; GFX10-NEXT: v_mov_b32_e32 v19, v22 -; GFX10-NEXT: v_mul_lo_u32 v27, v16, v15 -; GFX10-NEXT: v_mov_b32_e32 v18, v21 -; GFX10-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v17, v12, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[14:15], s6, v16, v11, v[18:19] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v5, v9, v[2:3] +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v16, v10, 0 +; GFX10-NEXT: v_mad_u64_u32 v[23:24], vcc_lo, v19, v9, v[20:21] +; GFX10-NEXT: v_mad_u64_u32 v[25:26], s4, v6, v8, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[2:3] +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v22, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v4, v8, v[23:24] +; GFX10-NEXT: v_mov_b32_e32 v23, v25 +; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, 1, s4 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v18, v8, v[0:1] +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v6, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v16, v13, v[22:23] +; GFX10-NEXT: v_add_co_ci_u32_e64 v29, s4, 0, v20, s4 +; GFX10-NEXT: v_mov_b32_e32 v20, v3 +; GFX10-NEXT: v_mad_u64_u32 v[22:23], s4, v17, v12, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[24:25], s6, v16, v11, v[20:21] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v16, v8, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v32, 0, 1, s6 -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s5, v2, v11, v[21:22] -; GFX10-NEXT: v_mad_u64_u32 v[11:12], s6, v17, v10, v[14:15] -; GFX10-NEXT: v_mov_b32_e32 v13, v1 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v32, s6 -; GFX10-NEXT: v_mov_b32_e32 v14, v20 -; GFX10-NEXT: v_mad_u64_u32 v[21:22], s7, v3, v10, v[18:19] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s6, v2, v9, v[11:12] -; GFX10-NEXT: v_add_co_ci_u32_e64 v15, s6, 0, v1, s6 -; GFX10-NEXT: v_mad_u64_u32 v[10:11], s8, v16, v9, v[13:14] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s6, v4, v9, v[21:22] -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s8 -; GFX10-NEXT: v_mad_u64_u32 v[12:13], s8, v3, v8, v[18:19] -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, 0, v15, s8 -; GFX10-NEXT: v_mad_u64_u32 v[14:15], s8, v5, v8, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[10:11] -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v4, v12, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v6, v13, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v9, v14, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v26, v15, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v25, v27, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v28, s8 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v30, s6 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v31, s7 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s5, v9, v29, s5 -; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v24, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v9, v23, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6 +; GFX10-NEXT: v_mad_u64_u32 v[20:21], s5, v18, v11, v[22:23] +; GFX10-NEXT: v_mul_lo_u32 v22, v16, v15 +; GFX10-NEXT: v_mul_lo_u32 v23, v17, v14 +; GFX10-NEXT: v_mad_u64_u32 v[14:15], s6, v17, v10, v[24:25] +; GFX10-NEXT: v_mul_lo_u32 v24, v19, v12 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s6, 0, v3, s6 +; GFX10-NEXT: v_mad_u64_u32 v[11:12], s7, v19, v10, v[20:21] +; GFX10-NEXT: v_mul_lo_u32 
v25, v18, v13 +; GFX10-NEXT: v_mad_u64_u32 v[20:21], s6, v18, v9, v[14:15] +; GFX10-NEXT: v_add_co_ci_u32_e64 v15, s6, 0, v3, s6 +; GFX10-NEXT: v_mad_u64_u32 v[13:14], s6, v4, v9, v[11:12] +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s8, v16, v9, v[1:2] +; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s8 +; GFX10-NEXT: v_mad_u64_u32 v[9:10], s8, v19, v8, v[20:21] +; GFX10-NEXT: v_add_co_ci_u32_e64 v15, s8, 0, v15, s8 +; GFX10-NEXT: v_mad_u64_u32 v[11:12], s8, v5, v8, v[13:14] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[3:4] +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v16, v9, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v29, v10, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v15, v11, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v6, v12, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v26, v22, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v23, s8 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v25, s6 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v24, s7 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s5, v9, v30, s5 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v9, v28, s4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v27, vcc_lo ; GFX10-NEXT: v_mad_u64_u32 v[7:8], s4, v7, v8, v[9:10] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2696,69 +2677,68 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 -; GFX11-NEXT: v_dual_mov_b32 v18, v8 :: v_dual_mov_b32 v19, v7 -; GFX11-NEXT: v_mul_lo_u32 v30, v4, v11 +; GFX11-NEXT: v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v19, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v8 :: v_dual_mov_b32 v21, v7 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v14, 0 -; GFX11-NEXT: v_mul_lo_u32 v28, v16, v15 -; GFX11-NEXT: v_mul_lo_u32 v29, v17, v14 -; GFX11-NEXT: v_mul_lo_u32 v32, v3, v12 -; GFX11-NEXT: v_mul_lo_u32 v31, v2, v13 -; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v17, v13, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v12, 0 -; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v2, v12, v[7:8] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], s0, v17, v11, v[0:1] +; GFX11-NEXT: v_mul_lo_u32 v27, v6, v9 +; GFX11-NEXT: v_mul_lo_u32 v30, v4, v11 +; GFX11-NEXT: v_mul_lo_u32 v31, v17, v14 +; GFX11-NEXT: v_mul_lo_u32 v28, v5, v10 +; GFX11-NEXT: v_mul_lo_u32 v15, v16, v15 +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v17, v13, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v18, v12, v[2:3] +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v16, v12, 0 +; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v19, v11, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v11, v[2:3] ; GFX11-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v11, v[20:21] -; GFX11-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v10, v[7:8] +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v4, v10, v[7:8] +; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v18, v10, v[0:1] ; GFX11-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v4, v10, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21] -; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v22, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v5, v9, v[7:8] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v16, v10, 0 -; GFX11-NEXT: v_mad_u64_u32 v[22:23], vcc_lo, v4, v18, v[0:1] -; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v26, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[24:25], null, v6, v18, v[20:21] -; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[7:8] -; GFX11-NEXT: v_mov_b32_e32 v7, 
v23 -; GFX11-NEXT: v_cndmask_b32_e64 v27, 0, 1, s0 -; GFX11-NEXT: v_mul_lo_u32 v23, v6, v9 -; GFX11-NEXT: v_mov_b32_e32 v8, v24 -; GFX11-NEXT: v_mul_lo_u32 v24, v5, v10 -; GFX11-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v18, v[0:1] -; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v16, v13, v[7:8] -; GFX11-NEXT: v_dual_mov_b32 v7, v22 :: v_dual_mov_b32 v6, v21 -; GFX11-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v17, v12, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[14:15], s2, v16, v11, v[6:7] -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v18, 0 -; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s2 -; GFX11-NEXT: v_mad_u64_u32 v[6:7], s1, v2, v11, v[21:22] -; GFX11-NEXT: v_mad_u64_u32 v[11:12], s2, v17, v10, v[14:15] -; GFX11-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v20 -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v8, s2 -; GFX11-NEXT: v_mad_u64_u32 v[21:22], s3, v3, v10, v[6:7] -; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v2, v9, v[11:12] -; GFX11-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v1, s2 -; GFX11-NEXT: v_mad_u64_u32 v[10:11], s4, v16, v9, v[13:14] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s2, v4, v9, v[21:22] -; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 -; GFX11-NEXT: v_mad_u64_u32 v[8:9], s4, v3, v18, v[6:7] -; GFX11-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v12, s4 -; GFX11-NEXT: v_mad_u64_u32 v[6:7], s4, v5, v18, v[1:2] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v18, v[10:11] -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v4, v8, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v27, v9, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v12, v6, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v7, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v25, v28, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v29, s4 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v31, s2 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v32, s3 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v9, v[2:3] +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v16, v10, 0 +; GFX11-NEXT: v_mad_u64_u32 v[25:26], vcc_lo, v19, v9, v[7:8] +; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v6, v20, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[2:3] +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v22, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[23:24], vcc_lo, v4, v20, v[25:26] +; GFX11-NEXT: v_mov_b32_e32 v25, v7 +; GFX11-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 +; GFX11-NEXT: v_mad_u64_u32 v[2:3], s0, v18, v20, v[0:1] +; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v6, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v16, v13, v[24:25] +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v22, s0 +; GFX11-NEXT: v_mov_b32_e32 v22, v3 +; GFX11-NEXT: v_mad_u64_u32 v[6:7], s0, v17, v12, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[24:25], s2, v16, v11, v[22:23] +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v20, 0 +; GFX11-NEXT: v_mad_u64_u32 v[22:23], s1, v18, v11, v[6:7] +; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v17, v10, v[24:25] +; GFX11-NEXT: v_mul_lo_u32 v24, v19, v12 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, s2 +; GFX11-NEXT: v_mad_u64_u32 v[11:12], s3, v19, v10, v[22:23] +; GFX11-NEXT: v_mul_lo_u32 v22, v18, v13 +; GFX11-NEXT: v_mad_u64_u32 v[13:14], s2, v18, v9, v[6:7] +; GFX11-NEXT: v_add_co_ci_u32_e64 v18, null, 0, v3, s2 +; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v4, v9, v[11:12] +; GFX11-NEXT: v_mad_u64_u32 v[3:4], s4, v16, v9, v[1:2] +; GFX11-NEXT: v_cndmask_b32_e64 v16, 0, 1, s4 +; GFX11-NEXT: v_mad_u64_u32 v[9:10], s4, v19, v20, v[13:14] 
+; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v18, s4 +; GFX11-NEXT: v_mad_u64_u32 v[11:12], s4, v5, v20, v[6:7] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v20, v[3:4] +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v16, v9, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v10, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v13, v11, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v12, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v8, v15, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v31, s4 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v22, s2 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v24, s3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v30, s1 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v24, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, v7, v23, s0 -; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v19, v18, v[9:10] +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v28, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, v7, v27, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v21, v20, v[9:10] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_mul_i256: @@ -2769,103 +2749,99 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 -; GFX12-NEXT: v_mul_lo_u32 v29, v4, v11 -; GFX12-NEXT: v_mul_lo_u32 v31, v3, v12 -; GFX12-NEXT: v_mul_lo_u32 v30, v2, v13 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v19, v3 +; GFX12-NEXT: v_mul_lo_u32 v27, v6, v9 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v14, 0 -; GFX12-NEXT: v_mul_lo_u32 v28, v17, v14 -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v17, v13, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v12, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v2, v12, v[18:19] -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[0:1] +; GFX12-NEXT: v_mul_lo_u32 v30, v4, v11 +; GFX12-NEXT: v_mul_lo_u32 v28, v5, v10 +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v17, v13, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v18, v12, v[2:3] +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v16, v12, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v19, v11, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v11, v[2:3] ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[20:21] -; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19] -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v4, v10, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v4, v10, v[20:21] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21] +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v18, v10, v[0:1] ; GFX12-NEXT: 
s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v22, vcc_lo +; GFX12-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v5, v9, v[18:19] -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v10, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], vcc_lo, v4, v8, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v9, v[2:3] +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v16, v10, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[23:24], vcc_lo, v19, v9, v[20:21] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mad_co_u64_u32 v[25:26], null, v6, v8, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[2:3] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v26, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], null, v6, v8, v[20:21] -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[18:19] +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v22, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX12-NEXT: v_mov_b32_e32 v18, v23 +; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v4, v8, v[23:24] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v27, 0, 1, s0 -; GFX12-NEXT: v_mul_lo_u32 v23, v6, v9 -; GFX12-NEXT: v_mov_b32_e32 v19, v24 -; GFX12-NEXT: v_mul_lo_u32 v24, v5, v10 -; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v8, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v27, vcc_lo -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[18:19] -; GFX12-NEXT: v_mov_b32_e32 v19, v22 -; GFX12-NEXT: v_mul_lo_u32 v27, v16, v15 -; GFX12-NEXT: v_mov_b32_e32 v18, v21 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v17, v12, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v16, v11, v[18:19] -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v8, 0 +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, vcc_lo +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], s0, v18, v8, v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v23, v25 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v32, 0, 1, s2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s1, v2, v11, v[21:22] -; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s2, v17, v10, v[14:15] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v20 +; GFX12-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v20, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v20, v3 +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], vcc_lo, v16, v13, v[22:23] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], s2, v16, v11, v[20:21] +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 +; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], s0, v17, v12, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v8, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], s1, v18, v11, v[22:23] +; GFX12-NEXT: v_mul_lo_u32 v22, v16, v15 +; 
GFX12-NEXT: v_mul_lo_u32 v23, v17, v14 +; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v17, v10, v[24:25] +; GFX12-NEXT: v_mul_lo_u32 v24, v19, v12 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v32, s2 -; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], s3, v3, v10, v[18:19] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s2, v2, v9, v[11:12] +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, s2 +; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s3, v19, v10, v[20:21] +; GFX12-NEXT: v_mul_lo_u32 v25, v18, v13 +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], s2, v18, v9, v[14:15] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v1, s2 -; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], s4, v16, v9, v[13:14] +; GFX12-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v3, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v4, v9, v[21:22] +; GFX12-NEXT: v_mad_co_u64_u32 v[13:14], s2, v4, v9, v[11:12] +; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], s4, v16, v9, v[1:2] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 -; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], s4, v3, v8, v[18:19] +; GFX12-NEXT: v_cndmask_b32_e64 v16, 0, 1, s4 +; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], s4, v19, v8, v[20:21] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v15, s4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s4, v5, v8, v[1:2] -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[10:11] +; GFX12-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, s4 +; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s4, v5, v8, v[13:14] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[3:4] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v4, v12, s5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v16, v9, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v6, v13, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v10, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v9, v14, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v15, v11, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v15, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v6, v12, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v25, v27, s5 -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, s4 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v26, v22, s5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v30, s2 -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v31, s3 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v23, s4 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v25, s2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v24, s3 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v30, s1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v29, s1 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, s0 ; GFX12-NEXT: s_wait_alu 0xfffd -; 
GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v24, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v23, s0 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v27, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3155,8 +3131,8 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v6, 0 ; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v4 -; GFX7-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v6, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v6, v[5:6] ; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm ; @@ -3167,8 +3143,8 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0 ; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v4 -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v7, v6, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v7, v6, v[5:6] ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; @@ -3179,8 +3155,8 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0 ; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v4 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v7, v6, v[3:4] -; GFX9-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v7, v6, v[5:6] ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_endpgm ; @@ -3200,8 +3176,8 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v4, 0 ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v4 -; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, 0x50, v6, v[3:4] -; GFX11-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v5, v3 +; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, 0x50, v6, v[5:6] ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-ignore-copies-crash.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-ignore-copies-crash.mir index 137488f24a331..7ca3869b535e4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-ignore-copies-crash.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-ignore-copies-crash.mir @@ -24,7 +24,7 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]] ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00 - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %5(s32) + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %5(s32) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = nnan G_AMDGPU_FMED3 [[FMUL]], %5, [[COPY2]] ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32) @@ -33,7 +33,7 @@ body: | %2:vgpr(s32) = COPY %1(s32) %3:vgpr(s32) = G_FMUL %0, %2 %4:sgpr(s32) = G_FCONSTANT float 
1.000000e+00 - INLINEASM &"v_mov_b32 $0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %5:vgpr_32 + INLINEASM &"v_mov_b32 $0, 0", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %5:vgpr_32 %6:vgpr(s32) = COPY %4(s32) %7:vgpr(s32) = nnan G_AMDGPU_FMED3 %3(s32), %5(s32), %6(s32) $vgpr0 = COPY %7(s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-block-addr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-block-addr.mir index a50c7fe0748b8..fc86dd884fac0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-block-addr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-block-addr.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O0 -march amdgcn -mcpu=fiji -run-pass=regbankselect %s -o - | FileCheck %s +# RUN: llc -O0 -march amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck %s --- | diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index 2843f72353db1..b7c84f1389197 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -31,102 +31,100 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_xor_b32_e32 v1, v3, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v2 ; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1 -; CHECK-NEXT: v_sub_i32_e32 v12, vcc, 0, v2 -; CHECK-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v13, vcc, 0, v2 +; CHECK-NEXT: v_subb_u32_e32 v14, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v6 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v3 -; CHECK-NEXT: v_trunc_f32_e32 v8, v6 -; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v8 -; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v3 -; CHECK-NEXT: v_cvt_u32_f32_e32 v14, v8 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v11, 0 -; CHECK-NEXT: v_mov_b32_e32 v3, v7 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v3, v14, v6 -; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8] -; CHECK-NEXT: v_mul_hi_u32 v7, v11, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v14, v6 -; CHECK-NEXT: v_mul_lo_u32 v8, v11, v9 -; CHECK-NEXT: v_mul_lo_u32 v10, v14, v9 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v11, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v10, v6 +; CHECK-NEXT: v_trunc_f32_e32 v6, v6 +; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v6 +; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CHECK-NEXT: v_cvt_u32_f32_e32 v12, v6 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v3, 0 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v12, v[7:8] +; CHECK-NEXT: v_mul_lo_u32 v7, v12, v6 +; CHECK-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v3, v[8:9] +; CHECK-NEXT: v_mul_hi_u32 v8, v3, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6 +; CHECK-NEXT: v_mul_lo_u32 v9, v3, v10 +; CHECK-NEXT: v_mul_lo_u32 v11, v12, v10 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CHECK-NEXT: v_mul_hi_u32 v8, v3, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v11, v6 +; CHECK-NEXT: 
v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_mul_hi_u32 v9, v12, v10 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CHECK-NEXT: v_mul_hi_u32 v8, v14, v9 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v3 -; CHECK-NEXT: v_addc_u32_e32 v14, vcc, v14, v6, vcc -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v11, 0 -; CHECK-NEXT: v_mov_b32_e32 v3, v7 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v12, 31, v5 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v12 -; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8] -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v12, vcc -; CHECK-NEXT: v_xor_b32_e32 v10, v3, v12 -; CHECK-NEXT: v_mul_lo_u32 v3, v14, v6 -; CHECK-NEXT: v_mul_lo_u32 v5, v11, v9 -; CHECK-NEXT: v_xor_b32_e32 v13, v4, v12 -; CHECK-NEXT: v_mul_hi_u32 v4, v11, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v14, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v14, v9 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v9 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_addc_u32_e32 v12, vcc, v12, v7, vcc +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v3, 0 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v12, v[7:8] +; CHECK-NEXT: v_ashrrev_i32_e32 v13, 31, v5 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; CHECK-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v3, v[8:9] +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v13, vcc +; CHECK-NEXT: v_xor_b32_e32 v9, v4, v13 +; CHECK-NEXT: v_mul_lo_u32 v4, v12, v6 +; CHECK-NEXT: v_mul_lo_u32 v7, v3, v10 +; CHECK-NEXT: v_xor_b32_e32 v11, v5, v13 +; CHECK-NEXT: v_mul_hi_u32 v5, v3, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v6, v14, v9 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v5, v12, v10 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v7, v3, v10 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v12, v10 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v11, v3 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, v13, v3 -; CHECK-NEXT: v_mul_lo_u32 v6, v10, v4 -; CHECK-NEXT: v_mul_hi_u32 v7, v10, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v13, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: 
v_add_i32_e32 v5, vcc, v6, v5 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v12, v5, vcc +; CHECK-NEXT: v_mul_lo_u32 v5, v11, v3 +; CHECK-NEXT: v_mul_lo_u32 v6, v9, v4 +; CHECK-NEXT: v_mul_hi_u32 v7, v9, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v11, v3 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v13, v4 +; CHECK-NEXT: v_mul_lo_u32 v7, v11, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v6, v10, v4 +; CHECK-NEXT: v_mul_hi_u32 v6, v9, v4 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v3, v5 -; CHECK-NEXT: v_mul_hi_u32 v7, v13, v4 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v9, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v11, vcc, v7, v5 -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[4:5] -; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v10, v3 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v9, v[5:6] -; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v7, vcc -; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v7 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v4 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v10, 0 +; CHECK-NEXT: v_add_i32_e32 v12, vcc, v6, v5 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v12, v[4:5] +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v9, v3 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v10, v[5:6] +; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v11, v7, vcc +; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v11, v7 ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 ; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] @@ -136,8 +134,8 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 ; CHECK-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v9 -; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v11, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v10 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v12, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 @@ -150,9 +148,9 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v12, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v3, v13, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3 ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -218,10 +216,10 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: s_subb_u32 s5, 0, s11 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; 
CHECK-NEXT: v_trunc_f32_e32 v2, v1 -; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 +; CHECK-NEXT: v_trunc_f32_e32 v1, v1 +; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v0 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v1 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0 ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2] ; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0 @@ -292,11 +290,11 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v4, s13, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, s13, v1 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v2 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v3, v2 ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v4, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 ; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3] @@ -379,266 +377,260 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc ; GISEL-NEXT: v_xor_b32_e32 v10, v4, v8 -; GISEL-NEXT: v_xor_b32_e32 v4, v5, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v10 -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v4 +; GISEL-NEXT: v_xor_b32_e32 v9, v5, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v10 +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v9 ; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v10 -; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v4, vcc -; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v5 -; GISEL-NEXT: v_trunc_f32_e32 v9, v9 -; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9 +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v9, vcc +; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; GISEL-NEXT: v_trunc_f32_e32 v5, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v4 ; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0 -; GISEL-NEXT: v_mov_b32_e32 v5, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v9, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v5, v9, v11 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] -; GISEL-NEXT: v_mul_hi_u32 v13, v16, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v17, v15, 0 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v5, v16, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v18, v15, v[11:12] +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v15, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 +; GISEL-NEXT: v_mul_hi_u32 v4, v16, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: 
v_mul_lo_u32 v13, v9, v14 +; GISEL-NEXT: v_mul_lo_u32 v11, v16, v13 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; GISEL-NEXT: v_mul_hi_u32 v12, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v16, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v5 -; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v11, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0 -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_mov_b32_e32 v5, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[5:6] -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] -; GISEL-NEXT: v_xor_b32_e32 v17, v0, v9 -; GISEL-NEXT: v_mul_lo_u32 v0, v19, v11 -; GISEL-NEXT: v_mul_lo_u32 v5, v16, v14 -; GISEL-NEXT: v_xor_b32_e32 v18, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v19, v11 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4 +; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v16, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v17, v15, 0 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, v[5:6] +; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v18, v15, v[11:12] +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc +; GISEL-NEXT: v_xor_b32_e32 v17, v0, v5 +; GISEL-NEXT: v_mul_lo_u32 v0, v16, v4 +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 +; GISEL-NEXT: v_xor_b32_e32 v18, v1, v5 +; GISEL-NEXT: v_mul_hi_u32 v1, v15, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v16, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v19, v14 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v5, v16, v14 +; GISEL-NEXT: v_mul_lo_u32 v1, v16, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v11, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; GISEL-NEXT: v_mul_hi_u32 v11, v19, v14 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v16, v13 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: 
v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v18, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v4, v18, v0 ; GISEL-NEXT: v_mul_lo_u32 v11, v17, v1 ; GISEL-NEXT: v_mul_hi_u32 v12, v17, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; GISEL-NEXT: v_xor_b32_e32 v5, v5, v8 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v12, v18, v1 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 ; GISEL-NEXT: v_mul_hi_u32 v11, v17, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v1, v18, v1 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v0, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v1, v5 -; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v10, v16, v[1:2] -; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v7 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v5 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v7, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v7, v1, v5 -; GISEL-NEXT: v_xor_b32_e32 v6, v6, v5 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v4, v0, v[12:13] -; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v17, v11 -; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v18, v14 -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v12 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v18, v14, vcc -; GISEL-NEXT: v_subb_u32_e32 v14, vcc, v11, v4, vcc -; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v15, v11 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v15 -; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 -; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v7 -; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v6, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v22, v15 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v10 -; GISEL-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v14, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v10 -; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v10 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v22, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v22, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v21, v4 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v20, v18, v[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, v18, v14 -; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v18, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v11, v22, v11 -; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9] -; 
GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v21, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v4, v13, v15, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v16, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v10 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v15, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; GISEL-NEXT: v_mul_lo_u32 v12, v22, v14 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v12, v18, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v14, v22, v14 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v18, v10 -; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v22, v11, vcc -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v19, v12, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v16, v13, vcc -; GISEL-NEXT: v_xor_b32_e32 v13, v9, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v14, v[0:1] -; GISEL-NEXT: v_xor_b32_e32 v11, v1, v13 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v11, v18, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v15, 0 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v11, v4 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v16, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v17, v0 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v9, v15, v[11:12] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v18, v13, vcc +; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v18, v13 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v9 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v4, v9, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v15 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v16, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v4 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v7, v4, vcc +; GISEL-NEXT: v_xor_b32_e32 v7, v6, v4 +; GISEL-NEXT: v_xor_b32_e32 v6, v17, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v17, v7 +; GISEL-NEXT: v_cvt_f32_u32_e32 v18, v6 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v9 +; GISEL-NEXT: v_mac_f32_e32 v17, 0x4f800000, v18 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v17 +; GISEL-NEXT: v_cndmask_b32_e32 
v0, v14, v0, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v12 +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v13, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 +; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GISEL-NEXT: v_trunc_f32_e32 v1, v1 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v0 +; GISEL-NEXT: v_sub_i32_e64 v19, s[4:5], 0, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 +; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], 0, v6, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v17, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v9, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v14, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v18, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v21, v18, v0 +; GISEL-NEXT: v_mul_hi_u32 v22, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v23, v18, v0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v17, v[9:10] +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v10, v15, v12, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v17, v0 +; GISEL-NEXT: v_mul_lo_u32 v11, v18, v0 ; GISEL-NEXT: v_ashrrev_i32_e32 v15, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v12, v[8:9] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v15 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v15, vcc -; GISEL-NEXT: v_xor_b32_e32 v16, v1, v15 -; GISEL-NEXT: v_mul_lo_u32 v1, v14, v10 -; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 -; GISEL-NEXT: v_xor_b32_e32 v17, v2, v15 -; GISEL-NEXT: v_mul_hi_u32 v2, v12, v10 -; GISEL-NEXT: v_mul_hi_u32 v8, v14, v10 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v21, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v22 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; GISEL-NEXT: v_mul_hi_u32 v9, v17, v0 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v23 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 +; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v9 +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v17, v1 +; GISEL-NEXT: v_addc_u32_e64 v14, s[4:5], v18, v0, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v12, 0 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v14, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v10, v5 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v15 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v20, v12, v[8:9] +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v15, vcc +; GISEL-NEXT: v_xor_b32_e32 v16, v2, v15 ; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GISEL-NEXT: v_mul_lo_u32 v8, v12, v10 +; GISEL-NEXT: v_xor_b32_e32 v17, v3, v15 ; GISEL-NEXT: v_mul_hi_u32 v3, v12, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, 
v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v14, v0, vcc -; GISEL-NEXT: v_mul_lo_u32 v2, v17, v1 -; GISEL-NEXT: v_mul_lo_u32 v3, v16, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v4, v13 -; GISEL-NEXT: v_mul_hi_u32 v4, v16, v1 -; GISEL-NEXT: v_mul_hi_u32 v1, v17, v1 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v10 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v14, v10 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v4, v17, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_mul_hi_u32 v3, v16, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v1, v2 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v17, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v16, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v16, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v10, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v1 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v12, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v11, v13 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v13, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v10, v[3:4] +; GISEL-NEXT: v_xor_b32_e32 v10, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; GISEL-NEXT: v_mul_hi_u32 v8, v16, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_mul_hi_u32 v8, v17, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v12, 0 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v8, v0 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v13, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v5 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v12, v[8:9] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v16, v2 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v17, v8, vcc -; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v17, v8 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v17, v10, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v17, v10 ; 
GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v6 -; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v4, v6, vcc +; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v6 -; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v9, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v10 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v12, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v12 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v8 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v8 ; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v4, v15, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc +; GISEL-NEXT: v_xor_b32_e32 v4, v15, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v5, vcc ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -667,28 +659,28 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v1, v3, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v1 -; CGP-NEXT: v_sub_i32_e32 v15, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v16, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v16, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 ; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; CGP-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; CGP-NEXT: v_trunc_f32_e32 v5, v4 -; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v14, v3 -; CGP-NEXT: v_cvt_u32_f32_e32 v17, v5 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v15, v14, 0 -; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[4:5] -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, v[12:13] -; CGP-NEXT: v_mul_lo_u32 v5, v17, v3 -; CGP-NEXT: v_mul_hi_u32 v12, v14, v3 -; CGP-NEXT: v_mul_lo_u32 v13, v14, v4 -; CGP-NEXT: v_mul_hi_u32 v3, v17, v3 -; CGP-NEXT: v_mul_lo_u32 v18, v17, v4 +; CGP-NEXT: v_trunc_f32_e32 v4, v4 +; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v15, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v14, v4 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v15, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[4:5] +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v17, v15, v[12:13] +; CGP-NEXT: v_mul_lo_u32 v5, v14, v3 +; CGP-NEXT: v_mul_hi_u32 v12, v15, v3 +; CGP-NEXT: v_mul_lo_u32 v13, 
v15, v4 +; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 +; CGP-NEXT: v_mul_lo_u32 v18, v14, v4 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v12, v15, v4 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v18, v3 @@ -696,44 +688,44 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v14, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v3 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v17, v4, vcc -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v15, v14, 0 -; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v15, 31, v11 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, v[12:13] -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v15 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v15, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v5, v15 -; CGP-NEXT: v_mul_lo_u32 v5, v17, v3 -; CGP-NEXT: v_mul_lo_u32 v11, v14, v4 -; CGP-NEXT: v_xor_b32_e32 v13, v10, v15 -; CGP-NEXT: v_mul_hi_u32 v10, v14, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v17, v3 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v3 +; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v4, vcc +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v15, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[4:5] +; CGP-NEXT: v_ashrrev_i32_e32 v16, 31, v11 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v17, v15, v[12:13] +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v16 +; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v16, vcc +; CGP-NEXT: v_xor_b32_e32 v12, v5, v16 +; CGP-NEXT: v_mul_lo_u32 v5, v14, v3 +; CGP-NEXT: v_mul_lo_u32 v11, v15, v4 +; CGP-NEXT: v_xor_b32_e32 v13, v10, v16 +; CGP-NEXT: v_mul_hi_u32 v10, v15, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v17, v4 +; CGP-NEXT: v_mul_lo_u32 v10, v14, v4 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; CGP-NEXT: v_mul_hi_u32 v11, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v11, v15, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v3 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v14, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v17, v4, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v15, v3 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc ; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 ; CGP-NEXT: v_mul_lo_u32 v10, v12, v4 ; CGP-NEXT: v_mul_hi_u32 v11, v12, v3 @@ -751,12 +743,12 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v3, 
v5 -; CGP-NEXT: v_mul_hi_u32 v11, v13, v4 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v3 +; CGP-NEXT: v_mul_hi_u32 v10, v13, v4 ; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v14, 0 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v11, v5 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v16, v[4:5] +; CGP-NEXT: v_add_i32_e32 v15, vcc, v10, v5 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v15, v[4:5] ; CGP-NEXT: v_sub_i32_e32 v3, vcc, v12, v3 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v14, v[10:11] ; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v13, v4, vcc @@ -771,7 +763,7 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc ; CGP-NEXT: v_cndmask_b32_e64 v5, v10, v11, s[4:5] ; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v14 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v16, vcc +; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v15, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 @@ -785,8 +777,8 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v3, v15, v0 -; CGP-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc +; CGP-NEXT: v_xor_b32_e32 v3, v16, v0 +; CGP-NEXT: v_cndmask_b32_e32 v2, v15, v2, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v1, v3 ; CGP-NEXT: v_xor_b32_e32 v1, v2, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -840,28 +832,28 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v3, v5, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v5, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 -; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 -; CGP-NEXT: v_trunc_f32_e32 v7, v6 -; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v12, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v15, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[6:7] -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[10:11] -; CGP-NEXT: v_mul_lo_u32 v7, v15, v5 -; CGP-NEXT: v_mul_hi_u32 v10, v12, v5 -; CGP-NEXT: v_mul_lo_u32 v11, v12, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v15, v5 -; CGP-NEXT: v_mul_lo_u32 v16, v15, v6 +; CGP-NEXT: v_trunc_f32_e32 v6, v6 +; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v13, v5 +; CGP-NEXT: v_cvt_u32_f32_e32 v12, v6 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v13, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[6:7] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v13, v[10:11] +; CGP-NEXT: v_mul_lo_u32 v7, v12, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v13, v5 +; CGP-NEXT: v_mul_lo_u32 v11, v13, v6 +; CGP-NEXT: v_mul_hi_u32 v5, v12, v5 +; CGP-NEXT: v_mul_lo_u32 v16, v12, v6 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v12, v6 +; CGP-NEXT: v_mul_hi_u32 v10, v13, v6 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, 
v11, v7 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v16, v5 @@ -869,53 +861,53 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v6, v15, v6 +; CGP-NEXT: v_mul_hi_u32 v6, v12, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v5 -; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v6, vcc -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[6:7] -; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v9 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[10:11] -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v13 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v7, v13 -; CGP-NEXT: v_mul_lo_u32 v7, v15, v5 -; CGP-NEXT: v_mul_lo_u32 v9, v12, v6 -; CGP-NEXT: v_xor_b32_e32 v14, v8, v13 -; CGP-NEXT: v_mul_hi_u32 v8, v12, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v15, v5 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v5 +; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v6, vcc +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v13, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[6:7] +; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v13, v[10:11] +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v14 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v7, v14 +; CGP-NEXT: v_mul_lo_u32 v7, v12, v5 +; CGP-NEXT: v_mul_lo_u32 v9, v13, v6 +; CGP-NEXT: v_xor_b32_e32 v15, v8, v14 +; CGP-NEXT: v_mul_hi_u32 v8, v13, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v12, v5 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v15, v6 +; CGP-NEXT: v_mul_lo_u32 v8, v12, v6 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; CGP-NEXT: v_mul_hi_u32 v9, v12, v6 +; CGP-NEXT: v_mul_hi_u32 v9, v13, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_mul_hi_u32 v6, v15, v6 +; CGP-NEXT: v_mul_hi_u32 v6, v12, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v15, v6, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v14, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v12, v6, vcc +; CGP-NEXT: v_mul_lo_u32 v7, v15, v5 ; CGP-NEXT: v_mul_lo_u32 v8, v11, v6 ; CGP-NEXT: v_mul_hi_u32 v9, v11, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v14, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v15, v5 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v14, v6 +; CGP-NEXT: v_mul_lo_u32 v9, v15, v6 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CGP-NEXT: v_mul_hi_u32 v8, v11, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 @@ -924,16 +916,16 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; 
CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v5, v7 -; CGP-NEXT: v_mul_hi_u32 v9, v14, v6 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v8, v15, v6 ; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v12, 0 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v7 -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v15, v[6:7] +; CGP-NEXT: v_add_i32_e32 v13, vcc, v8, v7 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v13, v[6:7] ; CGP-NEXT: v_sub_i32_e32 v5, vcc, v11, v5 ; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v12, v[7:8] -; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v14, v9, vcc -; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v14, v9 +; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v15, v9, vcc +; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v15, v9 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 ; CGP-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] @@ -944,7 +936,7 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc ; CGP-NEXT: v_cndmask_b32_e64 v6, v8, v9, s[4:5] ; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v12 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v15, vcc +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4 @@ -958,8 +950,8 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v13, v2 -; CGP-NEXT: v_cndmask_b32_e32 v4, v15, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v14, v2 +; CGP-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v2, v3, v5 ; CGP-NEXT: v_xor_b32_e32 v3, v4, v5 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 @@ -1054,10 +1046,10 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v4, v3 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 +; CHECK-NEXT: v_trunc_f32_e32 v3, v3 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 ; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v4 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v3 ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 ; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] ; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 @@ -1133,11 +1125,11 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, v5, v1 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v3, v2 ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v8, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 ; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc @@ -1186,155 +1178,152 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: s_subb_u32 s6, 0, 0 ; GISEL-NEXT: 
v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v7 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v7, v5 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v8, v[10:11] -; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4 -; GISEL-NEXT: v_mul_hi_u32 v11, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v7, v8, v13 -; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; GISEL-NEXT: v_trunc_f32_e32 v5, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6] +; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[9:10] +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v7, v13 +; GISEL-NEXT: v_mul_lo_u32 v4, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; GISEL-NEXT: v_mul_hi_u32 v14, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v8, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc -; GISEL-NEXT: v_mov_b32_e32 v4, v14 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v7, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v7, v13 -; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], s6, v18, v[14:15] +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v4, 0 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v6, v17, v[14:15] +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v18, v4, v13 +; GISEL-NEXT: v_mul_hi_u32 v19, v17, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v4, v[15:16] ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v14, v18, v16 +; GISEL-NEXT: v_mul_lo_u32 v14, v4, v13 +; GISEL-NEXT: v_mul_hi_u32 v15, v4, v13 ; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v18, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v13 -; 
GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v7, v16 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v18, v16 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v4 -; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v17, v0, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v15 -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v16 -; GISEL-NEXT: v_xor_b32_e32 v19, v1, v4 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v14 +; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v14, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v16, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v17, v1 -; GISEL-NEXT: v_mul_hi_u32 v15, v17, v0 +; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 +; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 -; GISEL-NEXT: v_mov_b32_e32 v7, 0x12d8fb +; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v15, v19, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v18, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v20, vcc, v15, v13 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v7, v20, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v17, v0 -; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v18, v[13:14] +; GISEL-NEXT: v_mul_hi_u32 v1, v19, v1 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v0, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v20, vcc, v1, v0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], 
s[4:5], v4, v17, 0 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v20, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v0 +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v17, v[13:14] ; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v15, vcc ; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v15 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v18 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v20, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v7 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v17 +; GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v20, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v19, -1, v0, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v14, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v9, v[0:1] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[13:14] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v16 -; GISEL-NEXT: v_mul_lo_u32 v13, v8, v0 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v17, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v19, -1, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[0:1] +; GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v16 +; GISEL-NEXT: v_mul_lo_u32 v5, v7, v13 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v18, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; GISEL-NEXT: v_cndmask_b32_e32 v14, v16, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v16, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v16, v18, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v5, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GISEL-NEXT: v_mul_hi_u32 v1, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; GISEL-NEXT: v_mul_hi_u32 v10, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_mul_hi_u32 v10, v8, v0 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v8, v1 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v9, v0, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v7, 
v0 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v8, v1, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v13, v20, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v11, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v12, v18, v14, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v12, v4 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[8:9] -; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc -; GISEL-NEXT: v_xor_b32_e32 v8, v2, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v14, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v5, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v13, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[7:8] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v20, v16, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc +; GISEL-NEXT: v_xor_b32_e32 v14, v2, v13 ; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 ; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 -; GISEL-NEXT: v_xor_b32_e32 v9, v3, v12 +; GISEL-NEXT: v_xor_b32_e32 v15, v3, v13 ; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 @@ -1356,62 +1345,61 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 ; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v9, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v8, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 -; GISEL-NEXT: v_xor_b32_e32 v10, v13, v4 +; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v14, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_xor_b32_e32 v7, v12, v9 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v9, v2 +; GISEL-NEXT: v_mul_lo_u32 v6, v15, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v8, v2 +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v9, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v11, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v6, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v13, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v11, v[5:6] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v9, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v9, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 -; GISEL-NEXT: 
v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v11 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7 +; GISEL-NEXT: v_mul_hi_u32 v5, v15, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v5, v0 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v9 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v7 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v15, v7, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, -1, v6, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v10 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v11, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v6, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v4, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v13 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v13 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_oddk_denom: @@ -1424,178 +1412,175 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v8, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 +; CGP-NEXT: v_trunc_f32_e32 v5, v5 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v8, v5 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 -; CGP-NEXT: v_mov_b32_e32 v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v8, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11] -; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 -; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v13 -; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 -; CGP-NEXT: v_add_i32_e32 v4, 
vcc, v10, v4 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6] +; CGP-NEXT: v_mul_hi_u32 v11, v8, v4 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v7, v[9:10] +; CGP-NEXT: v_mul_lo_u32 v9, v8, v4 +; CGP-NEXT: v_mul_hi_u32 v10, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v8, v12 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v8, v12 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v11 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; CGP-NEXT: v_add_i32_e32 v18, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 -; CGP-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc -; CGP-NEXT: v_mov_b32_e32 v4, v14 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v19, v13 -; CGP-NEXT: v_mad_u64_u32 v[16:17], s[4:5], -1, v18, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v9, v18, v16 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v18, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_addc_u32_e32 v16, vcc, v8, v12, vcc +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v6, v4, 0 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v16, v[13:14] +; CGP-NEXT: v_mul_lo_u32 v17, v16, v12 +; CGP-NEXT: v_mul_hi_u32 v18, v4, v12 +; CGP-NEXT: v_mul_hi_u32 v19, v16, v12 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v4, v[14:15] +; CGP-NEXT: v_mul_lo_u32 v13, v4, v12 +; CGP-NEXT: v_mul_hi_u32 v15, v4, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v17, v13 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v9, v19, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v19, v16 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; CGP-NEXT: v_mul_hi_u32 v14, v18, v16 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v18 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v9, v14 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 -; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v15, v0, v9 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v19, v16 -; CGP-NEXT: v_xor_b32_e32 v17, v1, v9 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_mul_lo_u32 v14, v16, v12 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; CGP-NEXT: v_mul_hi_u32 v17, v16, v12 +; CGP-NEXT: 
v_ashrrev_i32_e32 v12, 31, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v18, v0, v12 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v13 +; CGP-NEXT: v_xor_b32_e32 v19, v1, v12 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v17, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v15, v1 -; CGP-NEXT: v_mul_hi_u32 v16, v15, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v17, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v17, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 +; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 +; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v16, v17, v1 +; CGP-NEXT: v_mul_lo_u32 v15, v19, v1 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v15, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v18, v17, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v16, 0 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v18, vcc, v18, v13 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v18, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v15, v0 -; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v17, v13, vcc -; CGP-NEXT: v_sub_i32_e64 v13, s[4:5], v17, v13 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; CGP-NEXT: v_mul_hi_u32 v1, v19, v1 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v0, v13 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v1, v0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v18, v0 +; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc +; CGP-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v13 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CGP-NEXT: v_add_i32_e32 v17, vcc, 1, v16 -; CGP-NEXT: v_addc_u32_e32 v19, vcc, 0, v18, vcc +; CGP-NEXT: v_add_i32_e32 v18, vcc, 1, v15 +; CGP-NEXT: v_addc_u32_e32 v19, vcc, 0, v16, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v15, -1, v14, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1] -; CGP-NEXT: v_cndmask_b32_e64 v20, 0, -1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CGP-NEXT: 
v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[13:14] -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v20, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v17 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v19, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 -; CGP-NEXT: v_cndmask_b32_e32 v14, v17, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_mul_hi_u32 v5, v7, v0 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v7, v1 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v14, vcc -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v12, v18, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v13, v5, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[7:8] -; CGP-NEXT: v_xor_b32_e32 v1, v12, v9 -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v7, v2, v12 -; CGP-NEXT: v_mul_lo_u32 v2, v11, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v10, v5 -; CGP-NEXT: v_xor_b32_e32 v8, v3, v12 -; CGP-NEXT: v_mul_hi_u32 v3, v10, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v11, v0 +; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v14, s[4:5] +; CGP-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] +; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v18 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[0:1] +; CGP-NEXT: v_addc_u32_e32 v21, vcc, 0, v19, vcc +; CGP-NEXT: v_mul_lo_u32 v1, v7, v13 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; CGP-NEXT: v_cndmask_b32_e32 v0, v18, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v5, v19, v21, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; CGP-NEXT: v_cndmask_b32_e32 v14, v15, v0, vcc +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v9, v1 +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v0, v10 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 +; CGP-NEXT: v_mul_hi_u32 v1, v7, v13 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v13 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v10, v1 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v7, v0 +; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v1, s[4:5] +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, 0 +; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc +; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v3 +; 
CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v10, v[1:2] +; CGP-NEXT: v_xor_b32_e32 v1, v5, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v9, v[7:8] +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v7, v2, v13 +; CGP-NEXT: v_mul_lo_u32 v2, v10, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v9, v5 +; CGP-NEXT: v_xor_b32_e32 v8, v3, v13 +; CGP-NEXT: v_mul_hi_u32 v3, v9, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v11, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v10, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v10, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v9, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v11, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v10, v2, vcc ; CGP-NEXT: v_mul_lo_u32 v5, v8, v3 ; CGP-NEXT: v_mul_lo_u32 v6, v7, v2 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v14, v12 ; CGP-NEXT: v_mul_hi_u32 v9, v7, v3 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v12 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 @@ -1610,11 +1595,11 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v10, v8, v2 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v3 +; CGP-NEXT: v_mul_hi_u32 v6, v8, v2 ; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v5 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v6, v5 ; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v10, v[3:4] ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 ; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc @@ -1640,10 +1625,10 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CGP-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v12 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v12 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v13 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v13 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = sdiv <2 x i64> %num, ret <2 x i64> %result @@ -1679,28 +1664,28 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_xor_b32_e32 v1, v5, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v2 ; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1 -; 
CHECK-NEXT: v_sub_i32_e32 v12, vcc, 0, v2 -; CHECK-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v13, vcc, 0, v2 +; CHECK-NEXT: v_subb_u32_e32 v14, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; CHECK-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 -; CHECK-NEXT: v_trunc_f32_e32 v7, v6 -; CHECK-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 -; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v5 -; CHECK-NEXT: v_cvt_u32_f32_e32 v14, v7 -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v11, 0 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[6:7] -; CHECK-NEXT: v_mul_lo_u32 v6, v14, v5 -; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8] -; CHECK-NEXT: v_mul_hi_u32 v7, v11, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v14, v5 -; CHECK-NEXT: v_mul_lo_u32 v8, v11, v9 -; CHECK-NEXT: v_mul_lo_u32 v10, v14, v9 +; CHECK-NEXT: v_trunc_f32_e32 v6, v6 +; CHECK-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 +; CHECK-NEXT: v_cvt_u32_f32_e32 v12, v5 +; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v6 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v11, v[6:7] +; CHECK-NEXT: v_mul_lo_u32 v6, v11, v5 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v12, v[7:8] +; CHECK-NEXT: v_mul_hi_u32 v7, v12, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 +; CHECK-NEXT: v_mul_lo_u32 v8, v12, v9 +; CHECK-NEXT: v_mul_lo_u32 v10, v11, v9 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v11, v9 +; CHECK-NEXT: v_mul_hi_u32 v7, v12, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v10, v5 @@ -1708,53 +1693,53 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CHECK-NEXT: v_mul_hi_u32 v8, v14, v9 +; CHECK-NEXT: v_mul_hi_u32 v8, v11, v9 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v5 -; CHECK-NEXT: v_addc_u32_e32 v14, vcc, v14, v6, vcc -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v11, 0 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[6:7] -; CHECK-NEXT: v_ashrrev_i32_e32 v12, 31, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8] -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v12, vcc -; CHECK-NEXT: v_xor_b32_e32 v10, v3, v12 -; CHECK-NEXT: v_mul_lo_u32 v3, v14, v5 -; CHECK-NEXT: v_mul_lo_u32 v6, v11, v9 -; CHECK-NEXT: v_xor_b32_e32 v13, v4, v12 -; CHECK-NEXT: v_mul_hi_u32 v4, v11, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v14, v5 +; CHECK-NEXT: v_add_i32_e32 v12, vcc, v12, v5 +; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v6, vcc +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v11, v[6:7] +; CHECK-NEXT: v_ashrrev_i32_e32 v13, 31, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v13 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v12, v[7:8] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v13, vcc +; CHECK-NEXT: v_xor_b32_e32 v10, v3, v13 +; CHECK-NEXT: v_mul_lo_u32 v3, v11, v5 +; CHECK-NEXT: v_mul_lo_u32 v6, v12, v9 +; CHECK-NEXT: v_xor_b32_e32 v14, v4, v13 +; 
CHECK-NEXT: v_mul_hi_u32 v4, v12, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v14, v9 +; CHECK-NEXT: v_mul_lo_u32 v4, v11, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v9 +; CHECK-NEXT: v_mul_hi_u32 v6, v12, v9 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v14, v9 +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v11, v3 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, v13, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v12, v3 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v11, v4, vcc +; CHECK-NEXT: v_mul_lo_u32 v5, v14, v3 ; CHECK-NEXT: v_mul_lo_u32 v6, v10, v4 ; CHECK-NEXT: v_mul_hi_u32 v7, v10, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v13, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v14, v3 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v13, v4 +; CHECK-NEXT: v_mul_lo_u32 v7, v14, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CHECK-NEXT: v_mul_hi_u32 v6, v10, v4 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 @@ -1763,16 +1748,16 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v3, v5 -; CHECK-NEXT: v_mul_hi_u32 v7, v13, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v14, v4 ; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v9, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v11, vcc, v7, v5 +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v6, v5 ; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[4:5] ; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v10, v3 ; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v9, v[5:6] -; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v7, vcc -; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v7 +; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v7, vcc +; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v14, v7 ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 ; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] @@ -1797,7 +1782,7 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; CHECK-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v12, v0 +; CHECK-NEXT: v_xor_b32_e32 v3, v13, v0 ; CHECK-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3 ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3 @@ -1839,274 +1824,268 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-LABEL: v_sdiv_v2i64_pow2_shl_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GISEL-NEXT: v_mov_b32_e32 v9, 0x1000 -; GISEL-NEXT: v_mov_b32_e32 v10, 0 -; GISEL-NEXT: v_lshl_b64 v[7:8], v[9:10], v4 -; GISEL-NEXT: v_lshl_b64 v[9:10], v[9:10], v6 +; GISEL-NEXT: v_mov_b32_e32 v12, 0x1000 +; GISEL-NEXT: v_mov_b32_e32 v13, 0 +; GISEL-NEXT: v_lshl_b64 v[7:8], v[12:13], v4 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v8 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v4 ; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v8, v4, vcc ; GISEL-NEXT: v_xor_b32_e32 v8, v5, v4 -; GISEL-NEXT: v_xor_b32_e32 v5, v7, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v5 -; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v8 -; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v5, vcc -; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v11 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v10 -; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v7 -; GISEL-NEXT: v_trunc_f32_e32 v13, v11 -; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v13 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0 -; GISEL-NEXT: v_mov_b32_e32 v7, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[7:8] -; GISEL-NEXT: v_mul_lo_u32 v7, v19, v11 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] -; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v16, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v11, v19, v11 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v19, v14 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_mul_hi_u32 v13, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v19, v14 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v11, v7 +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v7 +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v8 +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v7, vcc +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v9, v9 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v9 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v5, 0 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v11, v[10:11] +; GISEL-NEXT: v_mul_lo_u32 v17, v11, v9 +; GISEL-NEXT: v_mul_hi_u32 v19, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v20, v11, v9 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v18, v5, v[14:15] +; GISEL-NEXT: v_mul_lo_u32 v10, v5, v9 +; GISEL-NEXT: v_mul_lo_u32 v15, v11, v9 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v17, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v5, v9 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, 
vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; GISEL-NEXT: v_mul_hi_u32 v9, v11, v9 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v11, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v5, 0 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v19, v[11:12] +; GISEL-NEXT: v_mul_lo_u32 v9, v19, v10 +; GISEL-NEXT: v_lshl_b64 v[12:13], v[12:13], v6 +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v5, v[14:15] +; GISEL-NEXT: v_mul_hi_u32 v14, v5, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v5, v16 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v7 -; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v19, v11, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0 -; GISEL-NEXT: v_mov_b32_e32 v7, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[7:8] -; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v17, v0, v7 -; GISEL-NEXT: v_mul_lo_u32 v0, v19, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14 -; GISEL-NEXT: v_xor_b32_e32 v18, v1, v7 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v19, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v9 +; GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], v1, v9, s[4:5] +; GISEL-NEXT: v_xor_b32_e32 v14, v0, v9 +; GISEL-NEXT: v_xor_b32_e32 v15, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v19, v14 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v12, v16, v14 +; GISEL-NEXT: v_mul_hi_u32 v1, v19, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v19, v16 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v11, v5, v16 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v19, v14 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v19, v16 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v18, v0 -; GISEL-NEXT: v_mul_lo_u32 v12, v17, v1 -; GISEL-NEXT: v_mul_hi_u32 v13, v17, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: 
v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_mul_lo_u32 v5, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v10, v14, v1 +; GISEL-NEXT: v_mul_hi_u32 v11, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_xor_b32_e32 v4, v9, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v1 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; GISEL-NEXT: v_mul_hi_u32 v10, v14, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v18, v1 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v12, v17, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v12 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; GISEL-NEXT: v_mul_hi_u32 v1, v18, v1 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v0, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v16, v[1:2] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v6 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v10, v6, vcc -; GISEL-NEXT: v_xor_b32_e32 v10, v1, v6 -; GISEL-NEXT: v_xor_b32_e32 v9, v9, v6 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v5, v0, v[12:13] -; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v10 -; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v9 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v17, v11 -; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v18, v14 -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v12 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v18, v14, vcc -; GISEL-NEXT: v_subb_u32_e32 v14, vcc, v11, v5, vcc -; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v15, v11 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v15 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v0, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v10, v15, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v16, 0 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v10, v5 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v17, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v16, v[10:11] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v15, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v7 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v7, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v16 +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v17, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v7 +; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v13 +; GISEL-NEXT: 
v_cndmask_b32_e64 v11, v6, v10, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v12, v5 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v13, v5, vcc +; GISEL-NEXT: v_xor_b32_e32 v10, v6, v5 +; GISEL-NEXT: v_xor_b32_e32 v6, v12, v5 +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v10 +; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v6 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v7 +; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v13 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v18, v0, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v14 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v15, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 +; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GISEL-NEXT: v_trunc_f32_e32 v1, v1 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v14, v7, vcc +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v0 +; GISEL-NEXT: v_sub_i32_e64 v19, s[4:5], 0, v10 ; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 -; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v10 -; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v22, v15 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v8 -; GISEL-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v14, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v8 -; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v8 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v22, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v22, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v21, v5 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v20, v18, v[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, v18, v14 -; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v18, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v11, v22, v11 -; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v21, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v15, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v16, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v8 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v15, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v8, v13, v17, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v22, v14 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_mul_hi_u32 v13, v18, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v14, v22, v14 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v11 -; 
GISEL-NEXT: v_addc_u32_e32 v14, vcc, v22, v12, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v19, v13, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v14, v[0:1] -; GISEL-NEXT: v_xor_b32_e32 v12, v1, v7 -; GISEL-NEXT: v_ashrrev_i32_e32 v15, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v13, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v15 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v15, vcc -; GISEL-NEXT: v_xor_b32_e32 v5, v1, v15 -; GISEL-NEXT: v_mul_lo_u32 v1, v14, v11 -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 -; GISEL-NEXT: v_xor_b32_e32 v16, v2, v15 -; GISEL-NEXT: v_mul_hi_u32 v2, v13, v11 -; GISEL-NEXT: v_mul_hi_u32 v4, v14, v11 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], 0, v6, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v14, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v15, v12, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v19, v18, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v21, v18, v0 +; GISEL-NEXT: v_mul_hi_u32 v22, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v23, v18, v0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v14, v[7:8] +; GISEL-NEXT: v_cndmask_b32_e32 v11, v16, v13, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v15, v17, v12, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v18, v0 +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v21, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v22 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v7, v1 +; GISEL-NEXT: v_mul_hi_u32 v7, v14, v0 +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v23 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v7, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 +; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v7 +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v14, v1 +; GISEL-NEXT: v_addc_u32_e64 v14, s[4:5], v18, v0, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v13, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v19, v14, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v20, v13, v[7:8] +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v12, v2, v9 ; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GISEL-NEXT: v_mul_lo_u32 v7, v13, v11 +; GISEL-NEXT: v_xor_b32_e32 v16, v3, v9 ; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, 
v0, v2 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v14, v0, vcc -; GISEL-NEXT: v_mul_lo_u32 v2, v16, v1 -; GISEL-NEXT: v_mul_lo_u32 v3, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v5, v1 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 -; GISEL-NEXT: v_xor_b32_e32 v8, v8, v7 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v11 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GISEL-NEXT: v_mul_hi_u32 v7, v13, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v14, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v4, v16, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_mul_hi_u32 v3, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v1, v2 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v16, v0 +; GISEL-NEXT: v_mul_lo_u32 v7, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v12, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v11, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v1 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v13, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v12, v7 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v9, v11, v[3:4] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v16, v7, vcc -; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v16, v7 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v9 -; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v4, v9, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] +; GISEL-NEXT: v_xor_b32_e32 v11, v15, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v16, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v12, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; GISEL-NEXT: v_mul_hi_u32 v7, v16, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v13, 0 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v7, v0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v14, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v11, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v13, v[7:8] +; 
GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v6 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v10 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9 -; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v7, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v11 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v4, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v6 +; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v13 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v14, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v4, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v5 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v7, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v4, v15, v6 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v7 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v13, v2, vcc +; GISEL-NEXT: v_xor_b32_e32 v4, v9, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -2138,28 +2117,28 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v1, v10, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v10, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v11, v1 -; CGP-NEXT: v_sub_i32_e32 v17, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v18, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v18, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v19, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 ; CGP-NEXT: v_rcp_iflag_f32_e32 v10, v10 ; CGP-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 ; CGP-NEXT: v_mul_f32_e32 v11, 0x2f800000, v10 -; CGP-NEXT: v_trunc_f32_e32 v12, v11 -; CGP-NEXT: v_mac_f32_e32 v10, 0xcf800000, v12 -; CGP-NEXT: v_cvt_u32_f32_e32 v16, v10 -; CGP-NEXT: v_cvt_u32_f32_e32 v19, v12 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v16, 0 -; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[11:12] -; CGP-NEXT: v_mul_lo_u32 v11, v19, v10 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] -; CGP-NEXT: v_mul_hi_u32 v12, v16, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v19, v10 -; CGP-NEXT: v_mul_lo_u32 v13, v16, v14 -; CGP-NEXT: v_mul_lo_u32 v15, v19, v14 +; CGP-NEXT: v_trunc_f32_e32 v11, v11 +; CGP-NEXT: v_mac_f32_e32 
v10, 0xcf800000, v11 +; CGP-NEXT: v_cvt_u32_f32_e32 v17, v10 +; CGP-NEXT: v_cvt_u32_f32_e32 v16, v11 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v18, v17, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v18, v16, v[11:12] +; CGP-NEXT: v_mul_lo_u32 v11, v16, v10 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v19, v17, v[12:13] +; CGP-NEXT: v_mul_hi_u32 v12, v17, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 +; CGP-NEXT: v_mul_lo_u32 v13, v17, v14 +; CGP-NEXT: v_mul_lo_u32 v15, v16, v14 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v16, v14 +; CGP-NEXT: v_mul_hi_u32 v12, v17, v14 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10 @@ -2167,53 +2146,53 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CGP-NEXT: v_mul_hi_u32 v13, v19, v14 +; CGP-NEXT: v_mul_hi_u32 v13, v16, v14 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v10 -; CGP-NEXT: v_addc_u32_e32 v19, vcc, v19, v11, vcc -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v16, 0 -; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[11:12] -; CGP-NEXT: v_ashrrev_i32_e32 v17, 31, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v17 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v17, vcc -; CGP-NEXT: v_xor_b32_e32 v15, v8, v17 -; CGP-NEXT: v_mul_lo_u32 v8, v19, v10 -; CGP-NEXT: v_mul_lo_u32 v11, v16, v14 -; CGP-NEXT: v_xor_b32_e32 v18, v9, v17 -; CGP-NEXT: v_mul_hi_u32 v9, v16, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v19, v10 +; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v10 +; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v11, vcc +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v18, v17, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v18, v16, v[11:12] +; CGP-NEXT: v_ashrrev_i32_e32 v18, 31, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v18 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v19, v17, v[12:13] +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v18, vcc +; CGP-NEXT: v_xor_b32_e32 v15, v8, v18 +; CGP-NEXT: v_mul_lo_u32 v8, v16, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v17, v14 +; CGP-NEXT: v_xor_b32_e32 v19, v9, v18 +; CGP-NEXT: v_mul_hi_u32 v9, v17, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v19, v14 +; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8 -; CGP-NEXT: v_mul_hi_u32 v11, v16, v14 +; CGP-NEXT: v_mul_hi_u32 v11, v17, v14 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v19, v14 +; CGP-NEXT: v_mul_hi_u32 v11, v16, v14 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v16, v8 -; CGP-NEXT: 
v_addc_u32_e32 v9, vcc, v19, v9, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v18, v8 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v17, v8 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v16, v9, vcc +; CGP-NEXT: v_mul_lo_u32 v10, v19, v8 ; CGP-NEXT: v_mul_lo_u32 v11, v15, v9 ; CGP-NEXT: v_mul_hi_u32 v12, v15, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v18, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v19, v8 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v12, v18, v9 +; CGP-NEXT: v_mul_lo_u32 v12, v19, v9 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_mul_hi_u32 v11, v15, v9 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8 @@ -2222,16 +2201,16 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v8, v10 -; CGP-NEXT: v_mul_hi_u32 v12, v18, v9 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v19, v9 ; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v14, 0 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v12, v10 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v11, v10 ; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v4, v16, v[9:10] ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v15, v8 ; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v1, v14, v[10:11] -; CGP-NEXT: v_subb_u32_e64 v9, s[4:5], v18, v12, vcc -; CGP-NEXT: v_sub_i32_e64 v10, s[4:5], v18, v12 +; CGP-NEXT: v_subb_u32_e64 v9, s[4:5], v19, v12, vcc +; CGP-NEXT: v_sub_i32_e64 v10, s[4:5], v19, v12 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1 ; CGP-NEXT: v_subb_u32_e32 v10, vcc, v10, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] @@ -2256,7 +2235,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cndmask_b32_e32 v4, v12, v8, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 ; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v8, v17, v0 +; CGP-NEXT: v_xor_b32_e32 v8, v18, v0 ; CGP-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v1, v8 ; CGP-NEXT: v_xor_b32_e32 v1, v4, v8 @@ -2313,102 +2292,100 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v3, v6, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v8, v3 -; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v15, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v16, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v8 ; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 ; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6 -; CGP-NEXT: v_trunc_f32_e32 v10, v8 -; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v10 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v16, v10 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0 -; CGP-NEXT: v_mov_b32_e32 v6, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v16, v[6:7] -; CGP-NEXT: v_mul_lo_u32 v6, v16, v8 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v9, v13, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v16, v8 -; CGP-NEXT: v_mul_lo_u32 v10, v13, v11 -; CGP-NEXT: v_mul_lo_u32 v12, v16, v11 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 
1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v13, v11 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8 +; CGP-NEXT: v_trunc_f32_e32 v8, v8 +; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v14, v8 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v15, v6, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v14, v[9:10] +; CGP-NEXT: v_mul_lo_u32 v9, v14, v8 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v6, v[10:11] +; CGP-NEXT: v_mul_hi_u32 v10, v6, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v14, v8 +; CGP-NEXT: v_mul_lo_u32 v11, v6, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v14, v12 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v6, v12 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_hi_u32 v11, v14, v12 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v11 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v6 -; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v8, vcc -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0 -; CGP-NEXT: v_mov_b32_e32 v6, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v16, v[6:7] -; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10] -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v14, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v5, v14 -; CGP-NEXT: v_mul_lo_u32 v5, v16, v8 -; CGP-NEXT: v_mul_lo_u32 v7, v13, v11 -; CGP-NEXT: v_xor_b32_e32 v15, v6, v14 -; CGP-NEXT: v_mul_hi_u32 v6, v13, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v16, v8 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v9, vcc +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v15, v6, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v14, v[9:10] +; CGP-NEXT: v_ashrrev_i32_e32 v15, 31, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v15 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v6, v[10:11] +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v15, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v5, v15 +; CGP-NEXT: v_mul_lo_u32 v5, v14, v8 +; CGP-NEXT: v_mul_lo_u32 v9, v6, v12 +; CGP-NEXT: v_xor_b32_e32 v13, v7, v15 +; CGP-NEXT: v_mul_hi_u32 v7, v6, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v14, v8 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v16, v11 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v7, v13, v11 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CGP-NEXT: v_mul_lo_u32 v7, v14, v12 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; CGP-NEXT: v_mul_hi_u32 v9, v6, v12 +; CGP-NEXT: 
v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v14, v12 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_mul_hi_u32 v8, v16, v11 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v16, v6, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v15, v5 -; CGP-NEXT: v_mul_lo_u32 v8, v12, v6 -; CGP-NEXT: v_mul_hi_u32 v9, v12, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v15, v5 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v14, v7, vcc +; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 +; CGP-NEXT: v_mul_lo_u32 v8, v11, v6 +; CGP-NEXT: v_mul_hi_u32 v9, v11, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v15, v6 +; CGP-NEXT: v_mul_lo_u32 v9, v13, v6 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_mul_hi_u32 v8, v12, v6 +; CGP-NEXT: v_mul_hi_u32 v8, v11, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v5, v7 -; CGP-NEXT: v_mul_hi_u32 v9, v15, v6 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, 0 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v9, v7 -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v13, v[6:7] -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v11, v[7:8] -; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v15, v9, vcc -; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v15, v9 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v5, v7 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v8, v13, v6 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v12, 0 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v8, v7 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v14, v[6:7] +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v11, v5 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v12, v[7:8] +; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v13, v9, vcc +; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v13, v9 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 ; CGP-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] @@ -2418,8 +2395,8 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 ; CGP-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc ; CGP-NEXT: v_cndmask_b32_e64 v6, v8, v9, s[4:5] -; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v11 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v12 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v14, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4 @@ -2432,9 +2409,9 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; 
CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; CGP-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v14, v2 -; CGP-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v15, v2 +; CGP-NEXT: v_cndmask_b32_e32 v4, v14, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v2, v3, v5 ; CGP-NEXT: v_xor_b32_e32 v3, v4, v5 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 @@ -2538,28 +2515,29 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1 ; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v10, 0 -; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, 0, v1 ; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v10 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v13, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; GISEL-NEXT: v_trunc_f32_e32 v5, v4 -; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[7:8] -; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 -; GISEL-NEXT: v_mul_hi_u32 v7, v9, v3 -; GISEL-NEXT: v_mul_lo_u32 v8, v9, v4 -; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 -; GISEL-NEXT: v_mul_lo_u32 v14, v13, v4 +; GISEL-NEXT: v_trunc_f32_e32 v4, v4 +; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v4 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v11, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v9, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v11, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v5, v9, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v3 +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v4 +; GISEL-NEXT: v_mul_hi_u32 v3, v9, v3 +; GISEL-NEXT: v_mul_lo_u32 v14, v9, v4 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3 @@ -2567,168 +2545,163 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v3 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] -; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[7:8] -; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 -; GISEL-NEXT: 
v_mul_lo_u32 v7, v9, v4 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v3 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v11, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v9, v[4:5] +; GISEL-NEXT: v_and_b32_e32 v12, 0xffffff, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v3 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v11, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v5, v9, v3 +; GISEL-NEXT: v_mul_hi_u32 v3, v9, v3 +; GISEL-NEXT: v_mul_lo_u32 v7, v11, v4 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v13, v4 +; GISEL-NEXT: v_mul_lo_u32 v5, v9, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_mul_hi_u32 v7, v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v9, v3, vcc ; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 -; GISEL-NEXT: v_mul_hi_u32 v7, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v12, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v12, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0 -; GISEL-NEXT: v_and_b32_e32 v13, 0xffffff, v2 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GISEL-NEXT: v_mul_lo_u32 v5, 0, v3 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v11, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v12, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_mul_hi_u32 v4, 0, v3 -; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 ; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v0, 0 -; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v3 -; GISEL-NEXT: v_mov_b32_e32 v5, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v4, v[5:6] -; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v10 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v12 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v0, v[8:9] -; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v11, v7 -; GISEL-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v2 -; GISEL-NEXT: v_trunc_f32_e32 v8, v6 -; GISEL-NEXT: v_mac_f32_e32 v2, 0xcf800000, v8 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v2 -; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, v5, vcc +; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v3 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v4, v[8:9] +; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v10 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v0, v[5:6] +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v11 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v12, v7 +; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, v8, vcc +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v6, v6 +; GISEL-NEXT: v_mac_f32_e32 v5, 
0xcf800000, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5 ; GISEL-NEXT: v_sub_i32_e64 v15, s[4:5], 0, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v6 ; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v14, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v8 -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5 -; GISEL-NEXT: v_mov_b32_e32 v2, v7 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v15, v17, v[2:3] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v13, 0 +; GISEL-NEXT: v_sub_i32_e64 v17, s[4:5], 0, v8 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v15, v14, v[6:7] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v14, v[7:8] +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[7:8] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v10, -1, v2, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v2, v17, v6 -; GISEL-NEXT: v_mul_lo_u32 v7, v14, v9 -; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v5, vcc -; GISEL-NEXT: v_mul_hi_u32 v5, v14, v6 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v12, -1, v6, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v6, v14, v5 +; GISEL-NEXT: v_mul_lo_u32 v7, v13, v9 +; GISEL-NEXT: v_mul_hi_u32 v10, v13, v5 +; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v17, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v17, v9 -; GISEL-NEXT: v_mul_hi_u32 v6, v17, v6 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; GISEL-NEXT: v_mul_hi_u32 v7, v14, v9 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_mul_hi_u32 v7, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v17, v9 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v2 -; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v17, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v12, 0 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v5 +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v14, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v13, 0 ; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v11, v1 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v14, v[2:3] ; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v12, v[6:7] -; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v15, v14, v[6:7] +; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v0 +; GISEL-NEXT: v_mad_u64_u32 
v[9:10], s[4:5], v16, v13, v[7:8] ; GISEL-NEXT: v_mul_lo_u32 v6, v14, v5 -; GISEL-NEXT: v_mul_lo_u32 v7, v12, v8 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v4, vcc -; GISEL-NEXT: v_mul_hi_u32 v9, v12, v5 +; GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v4, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v13, v9 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v5 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 ; GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v14, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, v14, v9 ; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_mul_hi_u32 v7, v12, v8 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v7, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v14, v8 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v8, v14, v9 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v14, v6, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, 0, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, v13, v7 -; GISEL-NEXT: v_mul_hi_u32 v12, v13, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v2 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v15, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; GISEL-NEXT: v_mul_lo_u32 v8, 0, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v14, v6, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, 0, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, v2, v6 +; GISEL-NEXT: v_mul_hi_u32 v11, v2, v5 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v15 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v18, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, 0, v6 ; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v13, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v2, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v5, v6 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v12, 0 -; GISEL-NEXT: v_mul_hi_u32 v14, 0, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v13, 0, v6 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v11, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc -; GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v14, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v11, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v12, v[6:7] -; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, 
v2, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v13, v5 -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v8 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v3, v13, v[6:7] +; GISEL-NEXT: v_cndmask_b32_e32 v9, v18, v10, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], 0, v11, v[7:8] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v9 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v9, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v3 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 @@ -2736,8 +2709,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v12 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v14, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v11 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v13, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 @@ -2748,8 +2721,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_24bit: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index f4489c2239fda..9d6ffc9bbc0dc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -172,11 +172,11 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: s_subb_u32 s15, 0, s9 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX8-NEXT: v_trunc_f32_e32 v2, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX8-NEXT: v_trunc_f32_e32 v1, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v6, 0 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s14, v7, v[1:2] ; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 @@ -247,11 +247,11 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2 -; GFX8-NEXT: v_mul_hi_u32 v4, s11, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0 +; GFX8-NEXT: v_mul_hi_u32 v3, s11, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v3, v2 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, 
v[1:2] ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s10, v0 ; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] @@ -333,11 +333,11 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: s_subb_u32 s11, 0, s7 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX9-NEXT: v_trunc_f32_e32 v2, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v1 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2] ; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 @@ -400,20 +400,20 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_mul_lo_u32 v4, s9, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, s8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v6, 0 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v7, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX9-NEXT: v_add3_u32 v8, v3, v2, v5 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s6, v8, v[1:2] ; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s8, v0 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s7, v6, v[2:3] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s7, v7, v[2:3] ; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], v1, v4, vcc ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v1 ; GFX9-NEXT: v_sub_u32_e32 v2, s9, v4 @@ -421,19 +421,19 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v1 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] ; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v0 ; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v2, vcc -; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v6 +; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v7 ; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v8, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v5 -; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s6, v4 +; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10 ; GFX9-NEXT: v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc @@ -442,15 +442,15 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 -; GFX9-NEXT: 
v_cndmask_b32_e32 v4, v4, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v10, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v11, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v8, v11, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v4, v0, v4, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1] ; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5] ; GFX9-NEXT: v_xor_b32_e32 v0, s0, v3 -; GFX9-NEXT: v_xor_b32_e32 v1, s1, v6 +; GFX9-NEXT: v_xor_b32_e32 v1, s1, v7 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc @@ -554,29 +554,29 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s8 ; GFX10-NEXT: v_add_co_u32 v0, s8, v5, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s8 +; GFX10-NEXT: v_mul_hi_u32 v5, s1, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v2, v6, v2 ; GFX10-NEXT: v_add_co_u32 v0, s8, v0, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s8 -; GFX10-NEXT: v_add_co_u32 v5, s8, v0, v2 -; GFX10-NEXT: v_mul_hi_u32 v2, s1, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s8 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s8, s6, v5, 0 -; GFX10-NEXT: v_add3_u32 v3, v3, v6, v2 +; GFX10-NEXT: v_add_co_u32 v6, s8, v0, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v4, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s8 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s8, s6, v6, 0 +; GFX10-NEXT: v_add3_u32 v3, v2, v3, v5 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s8, s6, v3, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s8, s7, v5, v[1:2] -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v5, 1 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s8, s7, v6, v[1:2] +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v6, 1 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v6, s1, v1 +; GFX10-NEXT: v_sub_nc_u32_e32 v5, s1, v1 ; GFX10-NEXT: v_sub_co_ci_u32_e64 v1, s0, s1, v1, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s7, v5, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v0, s6 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v6, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v5, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v1 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s7, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0 @@ -590,16 +590,16 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, v7, s0 ; GFX10-NEXT: v_sub_co_u32 v10, s0, v8, s6 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v6, s0, 0, v6, s0 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v5, s0, 0, v5, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v13, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v7, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc_lo +; GFX10-NEXT: 
v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_xor_b32_e32 v2, s4, v2 ; GFX10-NEXT: v_xor_b32_e32 v3, s5, v3 @@ -1308,11 +1308,11 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: s_subb_u32 s17, 0, s9 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX8-NEXT: v_trunc_f32_e32 v2, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX8-NEXT: v_trunc_f32_e32 v1, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2] ; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 @@ -1386,163 +1386,162 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2 -; GFX8-NEXT: v_mul_hi_u32 v4, s11, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0 +; GFX8-NEXT: v_mul_hi_u32 v3, s11, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v3, v2 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2] ; GFX8-NEXT: v_sub_u32_e32 v9, vcc, s10, v0 ; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: s_ashr_i32 s10, s3, 31 ; GFX8-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v4, vcc ; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s11, v4 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s8, v9 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GFX8-NEXT: v_subbrev_u32_e64 v10, s[0:1], 0, v0, vcc -; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], 1, v6 -; GFX8-NEXT: v_addc_u32_e64 v5, s[0:1], 0, v7, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v10 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v9 +; GFX8-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v0, vcc +; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], 1, v6 +; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v7, s[0:1] +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v4 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v3 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v10 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v4 +; GFX8-NEXT: v_subrev_u32_e32 v14, vcc, s8, v3 +; GFX8-NEXT: s_ashr_i32 s8, s3, 31 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v3 -; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v5, s[0:1] -; GFX8-NEXT: s_add_u32 s0, s18, s6 -; GFX8-NEXT: 
s_addc_u32 s1, s19, s6 -; GFX8-NEXT: s_add_u32 s2, s2, s10 -; GFX8-NEXT: s_mov_b32 s11, s10 -; GFX8-NEXT: s_addc_u32 s3, s3, s10 -; GFX8-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] -; GFX8-NEXT: v_cvt_f32_u32_e32 v14, s3 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc -; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s8, v4 -; GFX8-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v0, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v14 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v5 +; GFX8-NEXT: s_add_u32 s10, s18, s6 +; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1] +; GFX8-NEXT: s_addc_u32 s11, s19, s6 +; GFX8-NEXT: s_add_u32 s0, s2, s8 +; GFX8-NEXT: s_mov_b32 s9, s8 +; GFX8-NEXT: s_addc_u32 s1, s3, s8 +; GFX8-NEXT: s_xor_b64 s[2:3], s[0:1], s[8:9] +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX8-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v0, vcc +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc -; GFX8-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] -; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX8-NEXT: v_trunc_f32_e32 v11, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v11 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v0 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: s_xor_b64 s[10:11], s[10:11], s[6:7] ; GFX8-NEXT: s_sub_u32 s5, 0, s2 -; GFX8-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v12, 0 -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v11, v[1:2] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc +; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GFX8-NEXT: v_trunc_f32_e32 v1, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v11, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[18:19], s5, v11, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v3, v14, vcc ; GFX8-NEXT: s_subb_u32 s20, 0, s3 -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v12, v[1:2] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v5, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v12, v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v1, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v1, v11, v0 -; GFX8-NEXT: v_mul_lo_u32 v3, v12, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v2, s[0:1] -; GFX8-NEXT: v_mul_hi_u32 v2, v12, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v11, v0 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v11, v[2:3] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[0:1] +; GFX8-NEXT: v_mul_lo_u32 v1, v12, v0 ; GFX8-NEXT: v_mul_lo_u32 v2, v11, v4 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 -; GFX8-NEXT: v_mul_hi_u32 v3, v12, v4 -; GFX8-NEXT: 
v_add_u32_e32 v0, vcc, v2, v0 +; GFX8-NEXT: v_mul_hi_u32 v3, v11, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v12, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[0:1] +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v12, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v11, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 -; GFX8-NEXT: v_mul_hi_u32 v3, v11, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v12, v4 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, v12, v0 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v11, v0 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v10, 0 -; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v11, v1, vcc -; GFX8-NEXT: v_xor_b32_e32 v1, s16, v6 -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v11, v[0:1] -; GFX8-NEXT: v_xor_b32_e32 v5, s17, v7 -; GFX8-NEXT: v_mov_b32_e32 v6, s17 -; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s16, v1 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v5, v6, vcc -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s20, v10, v[3:4] +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v12, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v11, v[3:4] +; GFX8-NEXT: v_xor_b32_e32 v6, s16, v6 +; GFX8-NEXT: v_xor_b32_e32 v1, s17, v7 +; GFX8-NEXT: v_mov_b32_e32 v7, s17 +; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s16, v6 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s20, v10, v[4:5] ; GFX8-NEXT: v_mul_lo_u32 v4, v11, v2 ; GFX8-NEXT: v_xor_b32_e32 v3, s4, v9 -; GFX8-NEXT: v_mul_lo_u32 v7, v10, v5 +; GFX8-NEXT: v_mul_lo_u32 v7, v10, v6 ; GFX8-NEXT: v_mul_hi_u32 v9, v10, v2 ; GFX8-NEXT: v_mul_hi_u32 v2, v11, v2 -; GFX8-NEXT: v_xor_b32_e32 v6, s4, v8 +; GFX8-NEXT: v_xor_b32_e32 v5, s4, v8 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v9, v11, v5 +; GFX8-NEXT: v_mul_lo_u32 v9, v11, v6 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 -; GFX8-NEXT: v_mul_hi_u32 v7, v10, v5 +; GFX8-NEXT: v_mul_hi_u32 v7, v10, v6 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 -; GFX8-NEXT: v_mul_hi_u32 v5, v11, v5 +; GFX8-NEXT: v_mul_hi_u32 v6, v11, v6 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v10, v2 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v11, v4, vcc -; GFX8-NEXT: v_mul_lo_u32 v9, s9, v2 -; GFX8-NEXT: v_mul_lo_u32 v10, s8, v7 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v11, v4, vcc +; GFX8-NEXT: v_mul_lo_u32 v7, s11, v2 +; GFX8-NEXT: v_mul_lo_u32 v9, s10, v6 ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v3 -; GFX8-NEXT: v_mul_hi_u32 
v3, s8, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, s10, v2 ; GFX8-NEXT: v_mov_b32_e32 v8, s4 -; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v8, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v9, v10 +; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v5, v8, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v7, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, s9, v7 -; GFX8-NEXT: v_mul_hi_u32 v2, s9, v2 +; GFX8-NEXT: v_mul_lo_u32 v7, s11, v6 +; GFX8-NEXT: v_mul_hi_u32 v2, s11, v2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v8, v3 -; GFX8-NEXT: v_mul_hi_u32 v8, s8, v7 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX8-NEXT: v_mul_hi_u32 v8, s10, v6 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v2, v3 -; GFX8-NEXT: v_mul_hi_u32 v7, s9, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_mul_hi_u32 v6, s11, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 -; GFX8-NEXT: v_add_u32_e32 v11, vcc, v7, v6 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, v6, v7 ; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v11, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v12, s9 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s8, v2 +; GFX8-NEXT: v_mov_b32_e32 v12, s11 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s10, v2 ; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[0:1], s3, v10, v[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v12, v8, vcc -; GFX8-NEXT: v_sub_u32_e64 v7, s[0:1], s9, v8 +; GFX8-NEXT: v_sub_u32_e64 v7, s[0:1], s11, v8 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 @@ -1575,7 +1574,7 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cndmask_b32_e64 v8, v11, v14, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1] -; GFX8-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11] +; GFX8-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9] ; GFX8-NEXT: v_xor_b32_e32 v2, s0, v7 ; GFX8-NEXT: v_xor_b32_e32 v3, s1, v8 ; GFX8-NEXT: v_mov_b32_e32 v7, s1 @@ -1619,11 +1618,11 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: s_subb_u32 s17, 0, s9 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX9-NEXT: v_trunc_f32_e32 v2, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v1 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2] ; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 @@ -1703,152 +1702,151 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s10, v0 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] ; 
GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_ashr_i32 s10, s3, 31 ; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v8, v4, vcc -; GFX9-NEXT: v_sub_u32_e32 v0, s11, v4 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 +; GFX9-NEXT: v_sub_u32_e32 v0, s11, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 -; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s8, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v2, v3, s[0:1] -; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v0, vcc -; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v6 -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[0:1], 0, v7, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v12, s[0:1], 1, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v13, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: s_add_u32 s0, s18, s6 -; GFX9-NEXT: s_addc_u32 s1, s19, s6 -; GFX9-NEXT: s_add_u32 s2, s2, s10 -; GFX9-NEXT: s_mov_b32 s11, s10 -; GFX9-NEXT: s_addc_u32 s3, s3, s10 -; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] -; GFX9-NEXT: v_cvt_f32_u32_e32 v14, s3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v15, s2 ; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v14 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v15 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_subrev_co_u32_e32 v14, vcc, s8, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s8, v9 +; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v0, vcc +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], 1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v10, s[0:1], 0, v7, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v4 +; GFX9-NEXT: v_subrev_co_u32_e32 v14, vcc, s8, v3 +; GFX9-NEXT: s_ashr_i32 s8, s3, 31 +; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v12, s[0:1], 1, v5 +; GFX9-NEXT: s_add_u32 s10, s18, s6 +; GFX9-NEXT: v_addc_co_u32_e64 v13, s[0:1], 0, v10, s[0:1] +; GFX9-NEXT: s_addc_u32 s11, s19, s6 +; GFX9-NEXT: s_add_u32 s0, s2, s8 +; GFX9-NEXT: s_mov_b32 s9, s8 +; GFX9-NEXT: s_addc_u32 s1, s3, s8 +; GFX9-NEXT: s_xor_b64 s[2:3], s[0:1], s[8:9] +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX9-NEXT: v_trunc_f32_e32 v16, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v17, v0 -; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_xor_b64 s[10:11], s[10:11], s[6:7] ; GFX9-NEXT: s_sub_u32 s5, 0, s2 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v17, 0 -; GFX9-NEXT: 
v_cndmask_b32_e32 v5, v2, v12, vcc -; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v3, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc +; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v11, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[18:19], s5, v11, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v3, v14, vcc ; GFX9-NEXT: s_subb_u32 s20, 0, s3 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v12, v[1:2] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v6, v5, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v17, v[2:3] -; GFX9-NEXT: v_mul_lo_u32 v2, v12, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v14, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v17, v4 -; GFX9-NEXT: v_mul_hi_u32 v10, v17, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v13, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v15, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v10, v12, v4 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v12, v[1:2] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v11, v[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v1, v12, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v11, v4 +; GFX9-NEXT: v_mul_hi_u32 v3, v11, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, v12, v0 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, v17, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v12, v4 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v12, v4 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v11, v4 +; GFX9-NEXT: v_xor_b32_e32 v6, s16, v6 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v3, v10, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v2, v3, v2, v4 -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v17, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v12, v2, vcc -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v10, 0 -; GFX9-NEXT: v_xor_b32_e32 v1, s16, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v11, v[0:1] -; GFX9-NEXT: v_xor_b32_e32 v8, s17, v5 -; GFX9-NEXT: v_mov_b32_e32 v12, s17 -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s20, v10, v[3:4] -; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v1 -; GFX9-NEXT: v_mul_lo_u32 v3, v11, v2 -; GFX9-NEXT: v_mul_lo_u32 v6, v10, v5 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v8, v12, vcc -; GFX9-NEXT: v_mul_hi_u32 v8, v10, v2 -; GFX9-NEXT: 
v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, v11, v5 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v12, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v0 +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v10, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v12, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v11, v[3:4] +; GFX9-NEXT: v_xor_b32_e32 v1, s17, v7 +; GFX9-NEXT: v_mov_b32_e32 v7, s17 +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v6 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s20, v10, v[4:5] +; GFX9-NEXT: v_mul_lo_u32 v4, v11, v2 +; GFX9-NEXT: v_xor_b32_e32 v3, s4, v9 +; GFX9-NEXT: v_mul_lo_u32 v7, v10, v6 +; GFX9-NEXT: v_mul_hi_u32 v9, v10, v2 ; GFX9-NEXT: v_mul_hi_u32 v2, v11, v2 -; GFX9-NEXT: v_add_u32_e32 v3, v6, v3 -; GFX9-NEXT: v_mul_hi_u32 v6, v10, v5 -; GFX9-NEXT: v_mul_hi_u32 v5, v11, v5 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 -; GFX9-NEXT: v_add_u32_e32 v6, v8, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v3, v6, v3, v5 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v11, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, s9, v2 -; GFX9-NEXT: v_mul_lo_u32 v6, s8, v3 -; GFX9-NEXT: v_xor_b32_e32 v4, s4, v9 -; GFX9-NEXT: v_mul_hi_u32 v9, s8, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, s9, v2 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v9, s9, v3 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 -; GFX9-NEXT: v_mul_hi_u32 v6, s8, v3 -; GFX9-NEXT: v_mul_hi_u32 v12, s9, v3 +; GFX9-NEXT: v_xor_b32_e32 v5, s4, v8 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v9, v11, v6 +; GFX9-NEXT: v_add_u32_e32 v4, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v7, v10, v6 +; GFX9-NEXT: v_mul_hi_u32 v6, v11, v6 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GFX9-NEXT: v_add_u32_e32 v6, v9, v6 -; GFX9-NEXT: v_xor_b32_e32 v7, s4, v7 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_add_u32_e32 v7, v9, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v4, v7, v4, v6 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v11, v4, vcc +; GFX9-NEXT: v_mul_lo_u32 v7, s11, v2 +; GFX9-NEXT: v_mul_lo_u32 v9, s10, v6 +; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s4, v3 +; GFX9-NEXT: v_mul_hi_u32 v3, s10, v2 ; GFX9-NEXT: v_mov_b32_e32 v8, s4 -; 
GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s4, v4 -; GFX9-NEXT: v_add3_u32 v11, v6, v11, v12 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v8, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v8, vcc +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v7, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v7, s11, v6 +; GFX9-NEXT: v_mul_hi_u32 v2, s11, v2 +; GFX9-NEXT: v_add_u32_e32 v3, v8, v3 +; GFX9-NEXT: v_mul_hi_u32 v8, s10, v6 +; GFX9-NEXT: v_mul_hi_u32 v6, s11, v6 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v7, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v2, v3 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v7, v7, v8 +; GFX9-NEXT: v_add3_u32 v11, v7, v9, v6 ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v11, v[3:4] -; GFX9-NEXT: v_mov_b32_e32 v12, s9 -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s8, v2 +; GFX9-NEXT: v_mov_b32_e32 v12, s11 +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s10, v2 ; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], s3, v10, v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v12, v8, vcc ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v6 -; GFX9-NEXT: v_sub_u32_e32 v7, s9, v8 +; GFX9-NEXT: v_sub_u32_e32 v7, s11, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] @@ -1880,7 +1878,7 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cndmask_b32_e64 v8, v11, v14, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1] -; GFX9-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11] +; GFX9-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9] ; GFX9-NEXT: v_xor_b32_e32 v2, s0, v7 ; GFX9-NEXT: v_xor_b32_e32 v3, s1, v8 ; GFX9-NEXT: v_mov_b32_e32 v7, s1 @@ -1917,21 +1915,21 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: s_subb_u32 s20, 0, s7 ; GFX10-NEXT: s_xor_b64 s[16:17], s[4:5], s[8:9] ; GFX10-NEXT: s_ashr_i32 s8, s19, 31 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX10-NEXT: s_ashr_i32 s10, s3, 31 +; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 ; GFX10-NEXT: s_add_u32 s18, s18, s8 ; GFX10-NEXT: s_addc_u32 s19, s19, s8 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 ; GFX10-NEXT: s_add_u32 s2, s2, s10 ; GFX10-NEXT: s_mov_b32 s11, s10 ; GFX10-NEXT: s_addc_u32 s3, s3, s10 -; GFX10-NEXT: s_mov_b32 s9, s8 -; GFX10-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] ; GFX10-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX10-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] +; GFX10-NEXT: s_mov_b32 s9, s8 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s2 -; GFX10-NEXT: s_xor_b64 s[18:19], s[18:19], s[8:9] ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX10-NEXT: s_xor_b64 s[18:19], s[18:19], s[8:9] ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -1940,256 +1938,253 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_trunc_f32_e32 v2, v2 ; GFX10-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v1 ; GFX10-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 +; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v2 
; GFX10-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; GFX10-NEXT: v_cvt_u32_f32_e32 v9, v2 ; GFX10-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX10-NEXT: v_trunc_f32_e32 v6, v4 -; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v0 -; GFX10-NEXT: v_mul_f32_e32 v4, 0xcf800000, v6 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s21, v7, 0 +; GFX10-NEXT: v_trunc_f32_e32 v5, v4 +; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX10-NEXT: v_mul_f32_e32 v4, 0xcf800000, v5 +; GFX10-NEXT: v_cvt_u32_f32_e32 v9, v5 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s21, v6, 0 ; GFX10-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX10-NEXT: s_sub_u32 s5, 0, s2 ; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v3 -; GFX10-NEXT: v_mul_hi_u32 v10, v9, v0 -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s22, s5, v8, 0 -; GFX10-NEXT: v_mad_u64_u32 v[4:5], s22, s21, v9, v[1:2] -; GFX10-NEXT: v_cvt_u32_f32_e32 v5, v6 -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: v_mul_hi_u32 v6, v7, v0 +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s5, s21, v7, v[1:2] +; GFX10-NEXT: s_sub_u32 s5, 0, s2 +; GFX10-NEXT: v_mul_lo_u32 v10, v7, v0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s23, s5, v8, 0 ; GFX10-NEXT: s_subb_u32 s22, 0, s3 -; GFX10-NEXT: v_mul_hi_u32 v12, v8, v2 -; GFX10-NEXT: v_mul_lo_u32 v11, v5, v2 -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s23, s20, v7, v[4:5] -; GFX10-NEXT: v_mul_lo_u32 v4, v9, v0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s23, s5, v5, v[1:2] -; GFX10-NEXT: v_mul_hi_u32 v2, v5, v2 -; GFX10-NEXT: v_mul_lo_u32 v13, v7, v3 -; GFX10-NEXT: v_mul_lo_u32 v14, v9, v3 -; GFX10-NEXT: v_mul_hi_u32 v15, v7, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s23, s22, v8, v[0:1] -; GFX10-NEXT: v_mul_hi_u32 v1, v9, v3 -; GFX10-NEXT: v_add_co_u32 v3, s23, v4, v13 +; GFX10-NEXT: v_mul_hi_u32 v12, v7, v0 +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s23, s20, v6, v[3:4] +; GFX10-NEXT: v_mul_hi_u32 v11, v6, v0 +; GFX10-NEXT: v_mul_hi_u32 v14, v9, v1 +; GFX10-NEXT: v_mad_u64_u32 v[4:5], s23, s5, v9, v[2:3] +; GFX10-NEXT: v_mul_hi_u32 v5, v8, v1 +; GFX10-NEXT: v_mul_lo_u32 v13, v6, v3 +; GFX10-NEXT: v_mul_lo_u32 v15, v7, v3 +; GFX10-NEXT: v_mul_lo_u32 v2, v9, v1 +; GFX10-NEXT: v_mul_hi_u32 v16, v6, v3 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s23, s22, v8, v[4:5] +; GFX10-NEXT: v_mul_hi_u32 v1, v7, v3 +; GFX10-NEXT: v_add_co_u32 v3, s23, v10, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v10, s23, v14, v10 -; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s23 -; GFX10-NEXT: v_mul_lo_u32 v14, v8, v0 -; GFX10-NEXT: v_add_co_u32 v3, s23, v3, v6 +; GFX10-NEXT: v_add_co_u32 v10, s23, v15, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s23 +; GFX10-NEXT: v_mul_lo_u32 v13, v8, v0 +; GFX10-NEXT: v_add_co_u32 v3, s23, v3, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v6, s23, v10, v15 -; GFX10-NEXT: v_mul_lo_u32 v15, v5, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s23 +; GFX10-NEXT: v_mul_lo_u32 v15, v9, v0 +; GFX10-NEXT: v_add_co_u32 v10, s23, v10, v16 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s23 ; GFX10-NEXT: v_mul_hi_u32 v16, v8, v0 -; GFX10-NEXT: v_mul_hi_u32 v17, v5, v0 +; GFX10-NEXT: v_mul_hi_u32 v17, v9, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v4, v3 -; GFX10-NEXT: v_add_co_u32 v4, s23, v11, v14 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v13, v10 -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v2, s23, v15, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v0, s23, v6, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v4, s23, v4, v12 +; GFX10-NEXT: v_add_co_u32 v2, s23, v2, v13 +; GFX10-NEXT: 
v_add_nc_u32_e32 v3, v12, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v2, s23, v2, v16 -; GFX10-NEXT: v_add3_u32 v1, v3, v6, v1 -; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v7, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v10, v4 +; GFX10-NEXT: v_add_co_u32 v11, s23, v15, v14 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s23 -; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v9, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, s23, v10, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s23 +; GFX10-NEXT: v_add_co_u32 v2, s23, v2, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s23 +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v6, v0 +; GFX10-NEXT: v_add3_u32 v1, v3, v10, v1 +; GFX10-NEXT: v_add_co_u32 v5, s23, v11, v16 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v4, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s23 +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v1, vcc_lo ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s23, s21, v6, 0 -; GFX10-NEXT: v_add_co_u32 v2, s23, v2, v3 -; GFX10-NEXT: v_add_nc_u32_e32 v4, v11, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s23 -; GFX10-NEXT: v_mov_b32_e32 v10, 0 +; GFX10-NEXT: v_add_co_u32 v2, s23, v5, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v12, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s23 ; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v8, v2 -; GFX10-NEXT: v_mul_hi_u32 v11, v7, v0 -; GFX10-NEXT: v_add3_u32 v3, v4, v3, v17 -; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v5, v3, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s23, s5, v8, 0 -; GFX10-NEXT: v_mad_u64_u32 v[4:5], s21, s21, v7, v[1:2] -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: v_mul_lo_u32 v12, v9, v2 -; GFX10-NEXT: v_mul_hi_u32 v13, v8, v2 -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s20, s20, v6, v[4:5] -; GFX10-NEXT: v_mul_lo_u32 v4, v7, v0 -; GFX10-NEXT: v_mul_hi_u32 v5, v6, v0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s5, v9, v[1:2] -; GFX10-NEXT: v_mul_hi_u32 v2, v9, v2 -; GFX10-NEXT: v_mul_lo_u32 v14, v6, v3 +; GFX10-NEXT: v_mul_lo_u32 v10, v7, v0 +; GFX10-NEXT: v_add3_u32 v5, v3, v4, v17 +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s21, s21, v7, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s21, s5, v8, 0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v5, vcc_lo +; GFX10-NEXT: v_mul_hi_u32 v12, v7, v0 +; GFX10-NEXT: v_mul_hi_u32 v11, v6, v0 +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s20, s20, v6, v[3:4] +; GFX10-NEXT: v_mul_hi_u32 v14, v9, v1 +; GFX10-NEXT: v_mad_u64_u32 v[4:5], s5, s5, v9, v[2:3] +; GFX10-NEXT: v_mul_hi_u32 v5, v8, v1 +; GFX10-NEXT: v_mul_lo_u32 v13, v6, v3 ; GFX10-NEXT: v_mul_lo_u32 v15, v7, v3 +; GFX10-NEXT: v_mul_lo_u32 v2, v9, v1 ; GFX10-NEXT: v_mul_hi_u32 v16, v6, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s22, v8, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s22, v8, v[4:5] ; GFX10-NEXT: v_mul_hi_u32 v1, v7, v3 -; GFX10-NEXT: v_add_co_u32 v3, s5, v4, v14 +; GFX10-NEXT: v_add_co_u32 v3, s5, v10, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v11, s5, v15, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v3, s5, v3, v5 -; GFX10-NEXT: v_mul_lo_u32 v15, v8, v0 +; GFX10-NEXT: v_add_co_u32 v10, s5, v15, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s5 +; GFX10-NEXT: v_mul_lo_u32 v13, v8, v0 +; GFX10-NEXT: v_add_co_u32 v3, s5, v3, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v5, s5, v11, v16 -; GFX10-NEXT: v_mul_lo_u32 v16, v9, v0 +; GFX10-NEXT: v_mul_lo_u32 v15, v9, v0 +; GFX10-NEXT: v_add_co_u32 v10, s5, v10, v16 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3 
-; GFX10-NEXT: v_mul_hi_u32 v17, v8, v0 -; GFX10-NEXT: v_mul_hi_u32 v0, v9, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v4, v14, v11 -; GFX10-NEXT: v_add_co_u32 v11, s5, v12, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v2, s5, v16, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v3, s5, v5, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v11, s5, v11, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v2, s5, v2, v17 -; GFX10-NEXT: v_add3_u32 v1, v4, v5, v1 -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v6, v3 +; GFX10-NEXT: v_add_co_u32 v2, s5, v2, v13 +; GFX10-NEXT: v_mul_hi_u32 v16, v8, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v4, v12, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v12, s5, v15, v14 ; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v3, s5, v10, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v2, s5, v2, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v6, v3 +; GFX10-NEXT: v_add3_u32 v1, v4, v10, v1 +; GFX10-NEXT: v_add_co_u32 v5, s5, v12, v16 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v11, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s5 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v1, vcc_lo +; GFX10-NEXT: v_mul_hi_u32 v0, v9, v0 ; GFX10-NEXT: v_mul_lo_u32 v6, s1, v3 -; GFX10-NEXT: v_add_co_u32 v2, s5, v2, v4 -; GFX10-NEXT: v_add_nc_u32_e32 v5, v14, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 -; GFX10-NEXT: v_mul_lo_u32 v11, s0, v1 +; GFX10-NEXT: v_add_co_u32 v2, s5, v5, v2 +; GFX10-NEXT: v_mul_lo_u32 v10, s0, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v4, v13, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s5 ; GFX10-NEXT: v_mul_hi_u32 v7, s0, v3 ; GFX10-NEXT: v_mul_hi_u32 v3, s1, v3 -; GFX10-NEXT: v_mul_lo_u32 v12, s1, v1 -; GFX10-NEXT: v_add3_u32 v0, v5, v4, v0 +; GFX10-NEXT: v_mul_lo_u32 v11, s1, v1 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v8, v2 +; GFX10-NEXT: v_add3_u32 v0, v4, v5, v0 ; GFX10-NEXT: v_mul_hi_u32 v4, s0, v1 ; GFX10-NEXT: v_mul_hi_u32 v5, s1, v1 -; GFX10-NEXT: v_add_co_u32 v1, s5, v6, v11 -; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v9, v0, vcc_lo +; GFX10-NEXT: v_add_co_u32 v1, s5, v6, v10 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v3, s5, v12, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v3, s5, v11, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s5 ; GFX10-NEXT: v_add_co_u32 v1, s5, v1, v7 -; GFX10-NEXT: v_mul_lo_u32 v0, s19, v2 -; GFX10-NEXT: v_mul_lo_u32 v12, s18, v8 +; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v9, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s5 +; GFX10-NEXT: v_mul_lo_u32 v0, s19, v2 ; GFX10-NEXT: v_add_co_u32 v3, s5, v3, v4 +; GFX10-NEXT: v_mul_lo_u32 v7, s18, v8 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v6, v1 ; GFX10-NEXT: v_mul_hi_u32 v9, s18, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 ; GFX10-NEXT: v_mul_hi_u32 v2, s19, v2 -; GFX10-NEXT: v_mul_lo_u32 v7, s19, v8 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v6, v1 -; GFX10-NEXT: v_add_co_u32 v6, s5, v0, v12 -; GFX10-NEXT: v_mul_hi_u32 v13, s18, v8 -; GFX10-NEXT: v_add_nc_u32_e32 v4, v11, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v12, s5, v3, v1 -; GFX10-NEXT: v_add_co_u32 v2, s20, v7, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s5 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s6, v12, 0 -; GFX10-NEXT: v_add_co_u32 v6, s5, v6, v9 -; GFX10-NEXT: 
v_cndmask_b32_e64 v6, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v9, s5, v2, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s20 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5 -; GFX10-NEXT: v_add3_u32 v4, v4, v7, v5 -; GFX10-NEXT: v_add_nc_u32_e32 v6, v11, v6 -; GFX10-NEXT: v_mul_hi_u32 v5, s19, v8 -; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v12, 1 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 +; GFX10-NEXT: v_mul_lo_u32 v6, s19, v8 +; GFX10-NEXT: v_add_co_u32 v3, s5, v3, v1 +; GFX10-NEXT: v_add_co_u32 v7, s20, v0, v7 +; GFX10-NEXT: v_add_nc_u32_e32 v4, v10, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s5 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s6, v3, 0 +; GFX10-NEXT: v_mul_hi_u32 v11, s18, v8 +; GFX10-NEXT: v_add_co_u32 v6, s5, v6, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s5 +; GFX10-NEXT: v_add3_u32 v4, v4, v12, v5 +; GFX10-NEXT: v_add_co_u32 v2, s5, v7, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s20 +; GFX10-NEXT: v_mul_hi_u32 v7, s19, v8 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s5, s6, v4, v[1:2] -; GFX10-NEXT: v_add_co_u32 v6, s5, v9, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s5 -; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v4, vcc_lo -; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v7, 1 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s5, s7, v12, v[1:2] -; GFX10-NEXT: v_add3_u32 v5, v3, v9, v5 -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s5, s2, v6, 0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v8, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v14, vcc_lo, s0, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v9, s1, v1 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v15, s0, s1, v1, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v9, vcc_lo, s7, v9, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v3, vcc_lo, v14, s6 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v16, s0, 0, v9, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v14 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v9, vcc_lo, s7, v9, vcc_lo +; GFX10-NEXT: v_add_co_u32 v6, s5, v6, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5 +; GFX10-NEXT: v_add_nc_u32_e32 v5, v10, v5 +; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v3, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v4, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v8, v13, v2 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s5, s7, v3, v[1:2] +; GFX10-NEXT: v_add_co_u32 v5, s5, v6, v5 +; GFX10-NEXT: v_sub_co_u32 v12, vcc_lo, s0, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5 +; GFX10-NEXT: v_mov_b32_e32 v9, 0 +; GFX10-NEXT: v_sub_nc_u32_e32 v6, s1, v1 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v13, s0, s1, v1, vcc_lo +; GFX10-NEXT: v_add3_u32 v7, v8, v2, v7 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v15, vcc_lo, v12, s6 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v16, s0, 0, v6, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v13 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v3 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v15 ; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, -1, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v16 ; GFX10-NEXT: v_cndmask_b32_e64 v19, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, -1, s0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v5, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v5, 0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v13 +; GFX10-NEXT: 
v_cndmask_b32_e64 v14, v17, v14, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v16 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v19, v18, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v15 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v20, v17, s0 -; GFX10-NEXT: v_sub_co_u32 v1, s0, v3, s6 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v9, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, s3, v6, v[0:1] -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v17 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v13, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v12, v7, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v16, v9, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, s18, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v8, s0 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v8, s1, s19, v0, vcc_lo -; GFX10-NEXT: v_sub_nc_u32_e32 v0, s19, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v14, v3, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v15, v7, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v8 -; GFX10-NEXT: v_xor_b32_e32 v1, s16, v1 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v11, vcc_lo, s3, v0, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 -; GFX10-NEXT: v_xor_b32_e32 v4, s17, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, s0 -; GFX10-NEXT: v_xor_b32_e32 v3, s4, v3 -; GFX10-NEXT: v_xor_b32_e32 v7, s4, v7 +; GFX10-NEXT: v_cndmask_b32_e64 v17, v19, v18, s0 +; GFX10-NEXT: v_add_co_u32 v18, s0, v10, 1 +; GFX10-NEXT: v_add_co_ci_u32_e64 v19, s0, 0, v11, s0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s2, v7, v[1:2] +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v17 +; GFX10-NEXT: v_sub_co_u32 v2, s0, v15, s6 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v6, s0, 0, v6, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v18, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v10, v11, v19, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v11, v15, v2, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s1, s3, v5, v[1:2] +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v14 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, v8, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, v10, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v16, v6, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, s18, v0 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v10, s1, s19, v1, vcc_lo +; GFX10-NEXT: v_sub_nc_u32_e32 v1, s19, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v12, v11, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v13, v4, s0 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v10 +; GFX10-NEXT: v_xor_b32_e32 v0, s16, v2 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v11, vcc_lo, s3, v1, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v8 +; GFX10-NEXT: v_xor_b32_e32 v2, s17, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, -1, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v13, vcc_lo, v2, s2 +; GFX10-NEXT: v_sub_co_u32 v13, vcc_lo, v8, s2 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v14, s0, 0, v11, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v0, s0, v1, s16 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s0, s17, v4, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, v8 +; GFX10-NEXT: v_sub_co_u32 v0, s0, v0, s16 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s0, s17, v2, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, v10 +; GFX10-NEXT: v_xor_b32_e32 v2, s4, v6 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v11, vcc_lo, s3, v11, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v9, v12, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v12, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, -1, s0 ; GFX10-NEXT: 
v_cmp_le_u32_e64 s0, s2, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0 -; GFX10-NEXT: v_add_co_u32 v15, s0, v6, 1 -; GFX10-NEXT: v_add_co_ci_u32_e64 v16, s0, 0, v5, s0 +; GFX10-NEXT: v_add_co_u32 v15, s0, v5, 1 +; GFX10-NEXT: v_add_co_ci_u32_e64 v16, s0, 0, v7, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v12, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v12, s0 ; GFX10-NEXT: v_add_co_u32 v12, s0, v15, 1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v17, s0, 0, v16, s0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 -; GFX10-NEXT: v_sub_co_u32 v9, s0, v13, s2 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX10-NEXT: v_sub_co_u32 v6, s0, v13, s2 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v11, s0, 0, v11, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v12, v15, v12, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v4 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v13, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v9, v14, v11, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v12, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v5, v15, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v9, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, v14, v11, vcc_lo +; GFX10-NEXT: v_xor_b32_e32 v11, s4, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v12, v5, v12, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v15, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v8, v3, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v6, s0 ; GFX10-NEXT: s_xor_b64 s[0:1], s[8:9], s[10:11] -; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v3, s4 -; GFX10-NEXT: v_xor_b32_e32 v3, s0, v6 -; GFX10-NEXT: v_xor_b32_e32 v6, s1, v11 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s4, v7, vcc_lo -; GFX10-NEXT: v_xor_b32_e32 v7, s8, v2 -; GFX10-NEXT: v_xor_b32_e32 v8, s8, v8 -; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v3, s0 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v6, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v7, s8 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s8, v8, vcc_lo -; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[12:13] -; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[14:15] +; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v2, s4 +; GFX10-NEXT: v_xor_b32_e32 v2, s0, v12 +; GFX10-NEXT: v_xor_b32_e32 v7, s1, v7 +; GFX10-NEXT: v_xor_b32_e32 v8, s8, v3 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s4, v11, vcc_lo +; GFX10-NEXT: v_xor_b32_e32 v10, s8, v6 +; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v2, s0 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v7, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v8, s8 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s8, v10, vcc_lo +; GFX10-NEXT: global_store_dwordx4 v9, v[0:3], s[12:13] +; GFX10-NEXT: global_store_dwordx4 v9, v[4:7], s[14:15] ; GFX10-NEXT: s_endpgm %div = sdiv <2 x i64> %x, %y store <2 x i64> %div, ptr addrspace(1) %out0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index 6f42239cd191d..39cf7b01fd6c0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -31,28 +31,28 @@ define i64 @v_srem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v1 ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v1 -; CHECK-NEXT: v_sub_i32_e32 v11, vcc, 0, v0 -; CHECK-NEXT: v_subb_u32_e32 v12, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v12, vcc, 0, v0 +; CHECK-NEXT: 
v_subb_u32_e32 v13, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v6, v3 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v6 -; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v13, v6 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v10, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v3, v13, v2 -; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7] -; CHECK-NEXT: v_mul_hi_u32 v6, v10, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v13, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, v10, v8 -; CHECK-NEXT: v_mul_lo_u32 v9, v13, v8 +; CHECK-NEXT: v_trunc_f32_e32 v3, v3 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 +; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v3 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v11, 0 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[6:7] +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, v11, v8 +; CHECK-NEXT: v_mul_lo_u32 v9, v10, v8 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v10, v8 +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 @@ -60,53 +60,53 @@ define i64 @v_srem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_mul_hi_u32 v7, v13, v8 +; CHECK-NEXT: v_mul_hi_u32 v7, v10, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v2 -; CHECK-NEXT: v_addc_u32_e32 v13, vcc, v13, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v10, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v11, 31, v5 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v11 -; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7] -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v11, vcc -; CHECK-NEXT: v_xor_b32_e32 v9, v3, v11 -; CHECK-NEXT: v_mul_lo_u32 v3, v13, v2 -; CHECK-NEXT: v_mul_lo_u32 v5, v10, v8 -; CHECK-NEXT: v_xor_b32_e32 v12, v4, v11 -; CHECK-NEXT: v_mul_hi_u32 v4, v10, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v13, v2 +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v2 +; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v11, 0 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v12, 31, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v12 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[6:7] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v12, vcc +; CHECK-NEXT: v_xor_b32_e32 v9, v3, v12 +; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v11, v8 +; CHECK-NEXT: v_xor_b32_e32 v13, v4, v12 +; CHECK-NEXT: v_mul_hi_u32 v4, v11, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; 
CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v13, v8 +; CHECK-NEXT: v_mul_lo_u32 v4, v10, v8 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v5, v10, v8 +; CHECK-NEXT: v_mul_hi_u32 v5, v11, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v13, v8 +; CHECK-NEXT: v_mul_hi_u32 v5, v10, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v12, v2 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v11, v2 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v10, v3, vcc +; CHECK-NEXT: v_mul_lo_u32 v4, v13, v2 ; CHECK-NEXT: v_mul_lo_u32 v5, v9, v3 ; CHECK-NEXT: v_mul_hi_u32 v6, v9, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v12, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v13, v2 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v12, v3 +; CHECK-NEXT: v_mul_lo_u32 v6, v13, v3 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CHECK-NEXT: v_mul_hi_u32 v5, v9, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 @@ -115,16 +115,16 @@ define i64 @v_srem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v2, v4 -; CHECK-NEXT: v_mul_hi_u32 v6, v12, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v2 +; CHECK-NEXT: v_mul_hi_u32 v5, v13, v3 ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v8, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v5, v4 ; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v6, v[3:4] ; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 ; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v8, v[4:5] -; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v6, vcc -; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v6 +; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v6, vcc +; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v13, v6 ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 @@ -149,10 +149,10 @@ define i64 @v_srem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v11 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v11 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v11 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v11, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v12 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v12 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc ; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: ; implicit-def: $vgpr4 ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -212,10 +212,10 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: s_subb_u32 s5, 0, s9 ; CHECK-NEXT: v_mul_f32_e32 
v0, 0x5f7ffffc, v0 ; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; CHECK-NEXT: v_trunc_f32_e32 v2, v1 -; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 +; CHECK-NEXT: v_trunc_f32_e32 v1, v1 +; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v0 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v1 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0 ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2] ; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0 @@ -286,11 +286,11 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v4, s11, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, s11, v1 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v2 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v3, v2 ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v4, v[1:2] ; CHECK-NEXT: v_mov_b32_e32 v1, s9 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 @@ -372,212 +372,209 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v5 ; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v8 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v5 -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v8, vcc +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v5 +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v8, vcc ; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v9 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v11, v9 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v11 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v14, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v10 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v17, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v17, v9 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[10:11] -; GISEL-NEXT: v_mul_hi_u32 v11, v14, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v17, v9 -; GISEL-NEXT: v_mul_lo_u32 v10, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; GISEL-NEXT: v_trunc_f32_e32 v9, v9 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v9 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v4, 0 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v15, v[10:11] +; GISEL-NEXT: v_mul_lo_u32 v10, v15, v9 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v4, v[11:12] +; GISEL-NEXT: v_mul_hi_u32 v12, v4, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 +; GISEL-NEXT: v_mul_lo_u32 v11, v4, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; GISEL-NEXT: v_mul_hi_u32 v10, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v12, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: 
v_mul_hi_u32 v11, v4, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v13 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v4 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v14, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v10 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v17, v[4:5] +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v4, v9 +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v15, v10, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v18, 0 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v15, v[10:11] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v13, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v0, v17, v9 -; GISEL-NEXT: v_mul_lo_u32 v10, v14, v12 -; GISEL-NEXT: v_xor_b32_e32 v15, v1, v4 -; GISEL-NEXT: v_mul_hi_u32 v1, v14, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v17, v9 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v18, v[11:12] +; GISEL-NEXT: v_xor_b32_e32 v14, v0, v4 +; GISEL-NEXT: v_mul_lo_u32 v0, v15, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v18, v13 +; GISEL-NEXT: v_xor_b32_e32 v16, v1, v4 +; GISEL-NEXT: v_mul_hi_u32 v1, v18, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12 +; GISEL-NEXT: v_mul_lo_u32 v1, v15, v13 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v10, v14, v12 +; GISEL-NEXT: v_mul_hi_u32 v10, v18, v13 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v17, v12 +; GISEL-NEXT: v_mul_hi_u32 v10, v15, v13 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v10, v13, v1 -; GISEL-NEXT: v_mul_hi_u32 v11, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v15, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v16, v0 +; GISEL-NEXT: v_mul_lo_u32 v10, v14, v1 +; GISEL-NEXT: v_mul_hi_u32 v11, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; 
GISEL-NEXT: v_mul_lo_u32 v11, v15, v1 +; GISEL-NEXT: v_mul_lo_u32 v11, v16, v1 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_mul_hi_u32 v10, v13, v1 +; GISEL-NEXT: v_mul_hi_u32 v10, v14, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_hi_u32 v1, v15, v1 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v1, v0 -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v6, v6, v9 -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v9 -; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v16, v7 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v16 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v10, v16, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v13, 0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v10, v9 ; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v11, v[1:2] -; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v12 -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v14, v[9:10] -; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v12, v9 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v1 -; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v12 -; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v14, 0 -; GISEL-NEXT: v_sub_i32_e32 v19, vcc, v13, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v10 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v18, v[0:1] -; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], v15, v11, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v14, v[12:13] -; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v15, v11 -; GISEL-NEXT: v_mul_lo_u32 v11, v18, v9 -; GISEL-NEXT: v_mul_lo_u32 v12, v14, v0 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v8 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v14, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_hi_u32 v9, v18, v9 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v19, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v10, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[6:7] -; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v19, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v15, s[6:7], 0, v1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v8 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v15, v8 -; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v12, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v12, v5, vcc -; GISEL-NEXT: 
v_cndmask_b32_e32 v8, v15, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v12, v18, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v0 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v13, v[9:10] +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v14, v0 +; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], v16, v11, vcc +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v16, v11 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v5 +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v8 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v10, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v11, v1, v9, s[4:5] +; GISEL-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v0, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v1, 31, v7 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v1 +; GISEL-NEXT: v_addc_u32_e64 v7, s[4:5], v7, v1, s[4:5] +; GISEL-NEXT: v_xor_b32_e32 v6, v6, v1 +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v1 +; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v5 +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v0, v8, vcc +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 +; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GISEL-NEXT: v_trunc_f32_e32 v1, v1 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v0 +; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v17, 0 +; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v18, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v21, v18, v0 +; GISEL-NEXT: v_mul_hi_u32 v22, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v23, v18, v0 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v13, v5 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v16, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v17, v[8:9] +; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v17, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v18, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v21, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v22 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 +; GISEL-NEXT: v_mul_hi_u32 v8, v17, v0 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v23 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v1 -; 
GISEL-NEXT: v_addc_u32_e32 v13, vcc, v18, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v12, 0 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v1 +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v18, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v14, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v14, v10, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v13, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v12, v[8:9] -; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v11, v2, v5 -; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v12, v10 -; GISEL-NEXT: v_xor_b32_e32 v15, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v3, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v15, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v10, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v12, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v20, v14, v[8:9] +; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_xor_b32_e32 v11, v2, v12 +; GISEL-NEXT: v_mul_lo_u32 v2, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v14, v10 +; GISEL-NEXT: v_xor_b32_e32 v13, v3, v12 +; GISEL-NEXT: v_mul_hi_u32 v3, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v10 +; GISEL-NEXT: v_mul_lo_u32 v3, v15, v10 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; GISEL-NEXT: v_mul_hi_u32 v8, v12, v10 +; GISEL-NEXT: v_mul_hi_u32 v8, v14, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v13, v10 +; GISEL-NEXT: v_mul_hi_u32 v8, v15, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v15, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 ; GISEL-NEXT: v_mul_lo_u32 v8, v11, v2 ; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_xor_b32_e32 v10, v14, v4 +; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_xor_b32_e32 v5, v5, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v15, v2 +; GISEL-NEXT: v_mul_lo_u32 v9, v13, v2 ; GISEL-NEXT: 
v_add_i32_e32 v3, vcc, v8, v3 ; GISEL-NEXT: v_mul_hi_u32 v8, v11, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 @@ -585,48 +582,47 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v9, v15, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v12, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v9, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v13, v[0:1] +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v10, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v0, v[3:4] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v12, v[8:9] +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v10, v[8:9] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v13, v3 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v7 ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v2, v6 -; GISEL-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v2, v6 +; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v3, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v6 ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v7 -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v7 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v8, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v5 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v5 -; GISEL-NEXT: v_sub_i32_e32 v2, 
vcc, v2, v5 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64: @@ -651,28 +647,28 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v1, v2, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, v1 -; CGP-NEXT: v_sub_i32_e32 v15, vcc, 0, v0 -; CGP-NEXT: v_subb_u32_e32 v16, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 +; CGP-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CGP-NEXT: v_trunc_f32_e32 v4, v3 -; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v14, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v17, v4 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v15, v14, 0 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v17, v[3:4] -; CGP-NEXT: v_mul_lo_u32 v3, v17, v2 -; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[4:5] -; CGP-NEXT: v_mul_hi_u32 v4, v14, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v17, v2 -; CGP-NEXT: v_mul_lo_u32 v5, v14, v12 -; CGP-NEXT: v_mul_lo_u32 v13, v17, v12 +; CGP-NEXT: v_trunc_f32_e32 v3, v3 +; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v15, v2 +; CGP-NEXT: v_cvt_u32_f32_e32 v14, v3 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v15, 0 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, v[3:4] +; CGP-NEXT: v_mul_lo_u32 v3, v14, v2 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v15, v[4:5] +; CGP-NEXT: v_mul_hi_u32 v4, v15, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v14, v2 +; CGP-NEXT: v_mul_lo_u32 v5, v15, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v14, v12 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v14, v12 +; CGP-NEXT: v_mul_hi_u32 v4, v15, v12 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 @@ -680,53 +676,53 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_mul_hi_u32 v5, v17, v12 +; CGP-NEXT: v_mul_hi_u32 v5, v14, v12 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v2 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v17, v3, vcc -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v15, v14, 0 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v17, v[3:4] -; CGP-NEXT: v_ashrrev_i32_e32 v15, 31, v11 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v15 -; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[4:5] -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v11, v15, vcc -; CGP-NEXT: v_xor_b32_e32 v13, v3, v15 -; CGP-NEXT: v_mul_lo_u32 v3, v17, v2 -; CGP-NEXT: v_mul_lo_u32 v5, v14, v12 -; CGP-NEXT: v_xor_b32_e32 v16, v4, v15 -; CGP-NEXT: v_mul_hi_u32 v4, v14, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v17, v2 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v2 +; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v3, vcc +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v15, 0 +; CGP-NEXT: v_mad_u64_u32 v[4:5], 
s[4:5], v16, v14, v[3:4] +; CGP-NEXT: v_ashrrev_i32_e32 v16, 31, v11 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v16 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v15, v[4:5] +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v11, v16, vcc +; CGP-NEXT: v_xor_b32_e32 v13, v3, v16 +; CGP-NEXT: v_mul_lo_u32 v3, v14, v2 +; CGP-NEXT: v_mul_lo_u32 v5, v15, v12 +; CGP-NEXT: v_xor_b32_e32 v17, v4, v16 +; CGP-NEXT: v_mul_hi_u32 v4, v15, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v14, v2 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v4, v17, v12 +; CGP-NEXT: v_mul_lo_u32 v4, v14, v12 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CGP-NEXT: v_mul_hi_u32 v5, v14, v12 +; CGP-NEXT: v_mul_hi_u32 v5, v15, v12 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v17, v12 +; CGP-NEXT: v_mul_hi_u32 v5, v14, v12 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v14, v2 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v17, v3, vcc -; CGP-NEXT: v_mul_lo_u32 v4, v16, v2 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v15, v2 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v14, v3, vcc +; CGP-NEXT: v_mul_lo_u32 v4, v17, v2 ; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 ; CGP-NEXT: v_mul_hi_u32 v10, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v16, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v17, v2 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v16, v3 +; CGP-NEXT: v_mul_lo_u32 v10, v17, v3 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CGP-NEXT: v_mul_hi_u32 v5, v13, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v10, v2 @@ -735,16 +731,16 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v2, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v3 +; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v2 +; CGP-NEXT: v_mul_hi_u32 v5, v17, v3 ; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v12, 0 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v4 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v5, v4 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v10, v[3:4] ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v13, v2 ; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v1, v12, v[4:5] -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v16, v10, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v16, v10 +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v17, v10, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v17, v10 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 @@ -769,10 +765,10 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v15 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v15 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v15 -; CGP-NEXT: 
v_subb_u32_e32 v1, vcc, v1, v15, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v16 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v16 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v16, vcc ; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr10 ; CGP-NEXT: .LBB2_2: ; %Flow1 @@ -820,28 +816,28 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v3, v4, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v5, v3 -; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v6, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v12, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v15, v6 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v15, v[5:6] -; CGP-NEXT: v_mul_lo_u32 v5, v15, v4 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[6:7] -; CGP-NEXT: v_mul_hi_u32 v6, v12, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 -; CGP-NEXT: v_mul_lo_u32 v7, v12, v10 -; CGP-NEXT: v_mul_lo_u32 v11, v15, v10 +; CGP-NEXT: v_trunc_f32_e32 v5, v5 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; CGP-NEXT: v_cvt_u32_f32_e32 v13, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v12, v5 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v13, 0 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[5:6] +; CGP-NEXT: v_mul_lo_u32 v5, v12, v4 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v13, v[6:7] +; CGP-NEXT: v_mul_hi_u32 v6, v13, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v13, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v12, v10 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CGP-NEXT: v_mul_hi_u32 v6, v12, v10 +; CGP-NEXT: v_mul_hi_u32 v6, v13, v10 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 @@ -849,53 +845,53 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_mul_hi_u32 v7, v15, v10 +; CGP-NEXT: v_mul_hi_u32 v7, v12, v10 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v4 -; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v5, vcc -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v15, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v9 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v13 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[6:7] -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v9, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v13 -; CGP-NEXT: v_mul_lo_u32 v5, v15, v4 -; CGP-NEXT: v_mul_lo_u32 v7, v12, v10 -; CGP-NEXT: v_xor_b32_e32 v14, v6, v13 -; CGP-NEXT: v_mul_hi_u32 v6, v12, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v4 +; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v5, vcc +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v13, 0 +; CGP-NEXT: v_mad_u64_u32 v[6:7], 
s[4:5], v14, v12, v[5:6] +; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v14 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v13, v[6:7] +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v9, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v5, v14 +; CGP-NEXT: v_mul_lo_u32 v5, v12, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v13, v10 +; CGP-NEXT: v_xor_b32_e32 v15, v6, v14 +; CGP-NEXT: v_mul_hi_u32 v6, v13, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v15, v10 +; CGP-NEXT: v_mul_lo_u32 v6, v12, v10 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v7, v12, v10 +; CGP-NEXT: v_mul_hi_u32 v7, v13, v10 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v15, v10 +; CGP-NEXT: v_mul_hi_u32 v7, v12, v10 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v15, v5, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v14, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, v12, v5, vcc +; CGP-NEXT: v_mul_lo_u32 v6, v15, v4 ; CGP-NEXT: v_mul_lo_u32 v7, v11, v5 ; CGP-NEXT: v_mul_hi_u32 v8, v11, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v14, v5 +; CGP-NEXT: v_mul_lo_u32 v8, v15, v5 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CGP-NEXT: v_mul_hi_u32 v7, v11, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 @@ -904,16 +900,16 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v4, v6 -; CGP-NEXT: v_mul_hi_u32 v8, v14, v5 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v4 +; CGP-NEXT: v_mul_hi_u32 v7, v15, v5 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, 0 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v6 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v7, v6 ; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[5:6] ; CGP-NEXT: v_sub_i32_e32 v4, vcc, v11, v4 ; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v10, v[6:7] -; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v14, v8, vcc -; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v14, v8 +; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v15, v8, vcc +; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v15, v8 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2 @@ -938,10 +934,10 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v13 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v13 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, 
v13, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v14 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v14 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v14 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v14, vcc ; CGP-NEXT: ; implicit-def: $vgpr6 ; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -982,10 +978,10 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v4, v3 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 +; CHECK-NEXT: v_trunc_f32_e32 v3, v3 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 ; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v4 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v3 ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 ; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] ; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 @@ -1061,11 +1057,11 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_mul_hi_u32 v7, v5, v1 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v7 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v2 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v3 ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v7, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 ; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc @@ -1112,153 +1108,150 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: s_subb_u32 s6, 0, 0 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v7 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v7, v5 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v8, v[10:11] -; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4 -; GISEL-NEXT: v_mul_hi_u32 v11, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v7, v8, v13 -; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; GISEL-NEXT: v_trunc_f32_e32 v5, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6] +; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[9:10] +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v7, v13 +; GISEL-NEXT: v_mul_lo_u32 v4, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; GISEL-NEXT: v_mul_hi_u32 v14, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, 
v13 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v8, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 -; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v7, vcc -; GISEL-NEXT: v_mov_b32_e32 v4, v14 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v19, v13 -; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], s6, v18, v[14:15] +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v4, 0 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v6, v17, v[14:15] +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v18, v4, v13 +; GISEL-NEXT: v_mul_hi_u32 v19, v17, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v4, v[15:16] ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v7, v18, v16 +; GISEL-NEXT: v_mul_lo_u32 v14, v4, v13 +; GISEL-NEXT: v_mul_hi_u32 v15, v4, v13 ; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v18, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v7, v19, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v19, v16 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; GISEL-NEXT: v_mul_hi_u32 v14, v18, v16 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v7, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v7 -; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v17, v0, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v19, v16 -; GISEL-NEXT: v_xor_b32_e32 v20, v1, v7 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v14 +; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v16, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 
v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v20, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v17, v1 -; GISEL-NEXT: v_mul_hi_u32 v15, v17, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v20, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 +; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 +; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 ; GISEL-NEXT: v_mov_b32_e32 v4, 0x1000 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v20, v1 +; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v15, v20, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v18, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v13 +; GISEL-NEXT: v_mul_hi_u32 v1, v19, v1 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v0, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v1, v0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v17, 0 ; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] -; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v18, v[13:14] -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v17, v0 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v20, v15 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v20, v15, vcc +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v17, v[13:14] +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v18, v0 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v15 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v19, v15, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v16, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 -; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v9, v[0:1] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[13:14] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v18, v4 -; GISEL-NEXT: v_mul_lo_u32 v13, v8, v0 -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v19, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v4 +; GISEL-NEXT: v_mul_lo_u32 v5, v7, v13 +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v19, 
vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v18, v19, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v5, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GISEL-NEXT: v_mul_hi_u32 v1, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; GISEL-NEXT: v_mul_hi_u32 v10, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_mul_hi_u32 v10, v8, v0 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v8, v1 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v9, v0, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v7, v0 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v8, v1, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v13, v17, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v11, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v12, v16, v14, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v12, v7 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[8:9] -; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc -; GISEL-NEXT: v_xor_b32_e32 v9, v2, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v16, v14, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v5, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v13, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[7:8] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc +; GISEL-NEXT: v_xor_b32_e32 v14, v2, v13 ; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 ; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 -; GISEL-NEXT: v_xor_b32_e32 v14, v3, v12 +; GISEL-NEXT: v_xor_b32_e32 v15, v3, v13 ; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 @@ -1280,37 +1273,36 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 ; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v9, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v9, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v13, 
v7 +; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v14, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_xor_b32_e32 v7, v12, v9 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v14, v2 +; GISEL-NEXT: v_mul_lo_u32 v6, v15, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v2 +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v14, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc +; GISEL-NEXT: v_mul_hi_u32 v5, v15, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v9 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v9, vcc ; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v14, v7 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v14, v7, vcc +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v7 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v15, v7, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -1330,10 +1322,10 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v13 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v13 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_pow2k_denom: @@ -1346,176 +1338,173 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v8, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 +; CGP-NEXT: v_trunc_f32_e32 v5, v5 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v8, v5 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 -; CGP-NEXT: v_mov_b32_e32 v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v8, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11] -; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 -; CGP-NEXT: v_mul_hi_u32 v11, 
v7, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v13 -; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6] +; CGP-NEXT: v_mul_hi_u32 v11, v8, v4 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v7, v[9:10] +; CGP-NEXT: v_mul_lo_u32 v9, v8, v4 +; CGP-NEXT: v_mul_hi_u32 v10, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v8, v12 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v8, v12 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v11 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; CGP-NEXT: v_add_i32_e32 v18, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 -; CGP-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc -; CGP-NEXT: v_mov_b32_e32 v4, v14 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v19, v13 -; CGP-NEXT: v_mad_u64_u32 v[16:17], s[4:5], -1, v18, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v9, v18, v16 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v18, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_addc_u32_e32 v16, vcc, v8, v12, vcc +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v6, v4, 0 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v16, v[13:14] +; CGP-NEXT: v_mul_lo_u32 v17, v16, v12 +; CGP-NEXT: v_mul_hi_u32 v18, v4, v12 +; CGP-NEXT: v_mul_hi_u32 v19, v16, v12 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v4, v[14:15] +; CGP-NEXT: v_mul_lo_u32 v13, v4, v12 +; CGP-NEXT: v_mul_hi_u32 v15, v4, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v17, v13 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v9, v19, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v19, v16 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; CGP-NEXT: v_mul_hi_u32 v14, v18, v16 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v18 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v9, v14 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 -; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v15, v0, v9 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v19, v16 -; CGP-NEXT: v_xor_b32_e32 v17, v1, v9 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_mul_lo_u32 v14, v16, v12 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc 
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; CGP-NEXT: v_mul_hi_u32 v17, v16, v12 +; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v18, v0, v12 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v13 +; CGP-NEXT: v_xor_b32_e32 v19, v1, v12 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v17, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v15, v1 -; CGP-NEXT: v_mul_hi_u32 v16, v15, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v17, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v17, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 +; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 +; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x1000 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v16, v17, v1 +; CGP-NEXT: v_mul_lo_u32 v15, v19, v1 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v15, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v18, v17, v1 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v15, v19, v1 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v0 ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v13, 0 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v16 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v13 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v15, vcc, v15, v0 -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v17, v13 -; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v17, v13, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v14 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v15, vcc, v18, v0 +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 +; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v19, v13, vcc ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; CGP-NEXT: v_sub_i32_e32 v18, vcc, v15, v4 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4 ; CGP-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 -; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5] -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1] +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 -; CGP-NEXT: 
v_cndmask_b32_e32 v5, -1, v1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[13:14] -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v18, v4 -; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v19, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 -; CGP-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_mul_hi_u32 v5, v7, v0 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v7, v1 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v18, v4 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[0:1] +; CGP-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v19, vcc +; CGP-NEXT: v_mul_lo_u32 v1, v7, v13 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; CGP-NEXT: v_cndmask_b32_e32 v0, v18, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v5, v19, v21, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v14, vcc -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v12, v16, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v13, v5, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[7:8] -; CGP-NEXT: v_xor_b32_e32 v1, v12, v9 -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v7, v2, v12 -; CGP-NEXT: v_mul_lo_u32 v2, v11, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v10, v5 -; CGP-NEXT: v_xor_b32_e32 v8, v3, v12 -; CGP-NEXT: v_mul_hi_u32 v3, v10, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v11, v0 +; CGP-NEXT: v_cndmask_b32_e32 v14, v15, v0, vcc +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v9, v1 +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v0, v10 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 +; CGP-NEXT: v_mul_hi_u32 v1, v7, v13 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v13 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v10, v1 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v7, v0 +; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v1, s[4:5] +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, 0 +; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc +; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v3 +; CGP-NEXT: v_mad_u64_u32 v[7:8], 
s[4:5], v6, v10, v[1:2]
+; CGP-NEXT: v_xor_b32_e32 v1, v5, v12
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v9, v[7:8]
+; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc
+; CGP-NEXT: v_xor_b32_e32 v7, v2, v13
+; CGP-NEXT: v_mul_lo_u32 v2, v10, v0
+; CGP-NEXT: v_mul_lo_u32 v6, v9, v5
+; CGP-NEXT: v_xor_b32_e32 v8, v3, v13
+; CGP-NEXT: v_mul_hi_u32 v3, v9, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v10, v0
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v3, v11, v5
+; CGP-NEXT: v_mul_lo_u32 v3, v10, v5
; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; CGP-NEXT: v_mul_hi_u32 v6, v10, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v9, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v11, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v10, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0
-; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v0
+; CGP-NEXT: v_addc_u32_e32 v2, vcc, v10, v2, vcc
; CGP-NEXT: v_mul_lo_u32 v5, v8, v3
; CGP-NEXT: v_mul_lo_u32 v6, v7, v2
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9
-; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
+; CGP-NEXT: v_xor_b32_e32 v11, v14, v12
; CGP-NEXT: v_mul_hi_u32 v9, v7, v3
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v12
+; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9
@@ -1530,11 +1519,11 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6
; CGP-NEXT: v_add_i32_e32 v5, vcc, v3, v5
-; CGP-NEXT: v_mul_hi_u32 v10, v8, v2
+; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v9, v8, v2
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v3
; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0
-; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v9
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v5
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v6
; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[3:4]
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2
; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc
@@ -1558,10 +1547,10 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v12
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v12
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v13
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v13
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v13
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = srem <2 x i64> %num, <i64 4096, i64 4096>
ret <2 x i64> %result
@@ -1578,10 +1567,10 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
-; CHECK-NEXT: 
v_trunc_f32_e32 v4, v3 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 +; CHECK-NEXT: v_trunc_f32_e32 v3, v3 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 ; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v4 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v3 ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 ; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] ; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 @@ -1657,11 +1646,11 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_mul_hi_u32 v7, v5, v1 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v7 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v2 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v3 ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v7, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 ; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc @@ -1708,153 +1697,150 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: s_subb_u32 s6, 0, 0 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v7 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v7, v5 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v8, v[10:11] -; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4 -; GISEL-NEXT: v_mul_hi_u32 v11, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v7, v8, v13 -; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; GISEL-NEXT: v_trunc_f32_e32 v5, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6] +; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[9:10] +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v7, v13 +; GISEL-NEXT: v_mul_lo_u32 v4, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; GISEL-NEXT: v_mul_hi_u32 v14, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v8, v4 -; GISEL-NEXT: 
v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 -; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v7, vcc -; GISEL-NEXT: v_mov_b32_e32 v4, v14 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v19, v13 -; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], s6, v18, v[14:15] +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v4, 0 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v6, v17, v[14:15] +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v18, v4, v13 +; GISEL-NEXT: v_mul_hi_u32 v19, v17, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v4, v[15:16] ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v7, v18, v16 +; GISEL-NEXT: v_mul_lo_u32 v14, v4, v13 +; GISEL-NEXT: v_mul_hi_u32 v15, v4, v13 ; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v18, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v7, v19, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v19, v16 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; GISEL-NEXT: v_mul_hi_u32 v14, v18, v16 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v7, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v7 -; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v17, v0, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v19, v16 -; GISEL-NEXT: v_xor_b32_e32 v20, v1, v7 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v14 +; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v16, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v20, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v17, v1 -; GISEL-NEXT: v_mul_hi_u32 v15, v17, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v20, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 +; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 +; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 +; GISEL-NEXT: 
v_mul_hi_u32 v0, v19, v0 ; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v20, v1 +; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v15, v20, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v18, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v13 +; GISEL-NEXT: v_mul_hi_u32 v1, v19, v1 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v0, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v1, v0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v17, 0 ; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] -; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v18, v[13:14] -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v17, v0 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v20, v15 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v20, v15, vcc +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v17, v[13:14] +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v18, v0 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v15 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v19, v15, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v16, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 -; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v9, v[0:1] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[13:14] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v18, v4 -; GISEL-NEXT: v_mul_lo_u32 v13, v8, v0 -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v19, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v4 +; GISEL-NEXT: v_mul_lo_u32 v5, v7, v13 +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v19, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v18, v19, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 
v5, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GISEL-NEXT: v_mul_hi_u32 v1, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; GISEL-NEXT: v_mul_hi_u32 v10, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_mul_hi_u32 v10, v8, v0 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v8, v1 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v9, v0, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v7, v0 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v8, v1, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v13, v17, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v11, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v12, v16, v14, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v12, v7 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[8:9] -; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc -; GISEL-NEXT: v_xor_b32_e32 v9, v2, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v16, v14, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v5, v9 +; GISEL-NEXT: v_ashrrev_i32_e32 v13, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[7:8] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc +; GISEL-NEXT: v_xor_b32_e32 v14, v2, v13 ; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 ; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 -; GISEL-NEXT: v_xor_b32_e32 v14, v3, v12 +; GISEL-NEXT: v_xor_b32_e32 v15, v3, v13 ; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 @@ -1876,37 +1862,36 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 ; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v9, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v9, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v13, v7 +; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v14, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_xor_b32_e32 v7, v12, v9 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v14, v2 +; GISEL-NEXT: v_mul_lo_u32 v6, v15, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; 
GISEL-NEXT: v_mul_hi_u32 v5, v9, v2 +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v14, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc +; GISEL-NEXT: v_mul_hi_u32 v5, v15, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v9 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v9, vcc ; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v14, v7 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v14, v7, vcc +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v7 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v15, v7, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -1926,10 +1911,10 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v13 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v13 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_oddk_denom: @@ -1942,176 +1927,173 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v8, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 +; CGP-NEXT: v_trunc_f32_e32 v5, v5 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v8, v5 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 -; CGP-NEXT: v_mov_b32_e32 v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v8, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11] -; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 -; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v13 -; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6] +; CGP-NEXT: v_mul_hi_u32 v11, v8, v4 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v7, v[9:10] +; CGP-NEXT: v_mul_lo_u32 v9, v8, v4 +; CGP-NEXT: v_mul_hi_u32 v10, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v8, v12 
+; CGP-NEXT: v_mul_hi_u32 v14, v7, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v8, v12 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v11 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; CGP-NEXT: v_add_i32_e32 v18, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 -; CGP-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc -; CGP-NEXT: v_mov_b32_e32 v4, v14 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v19, v13 -; CGP-NEXT: v_mad_u64_u32 v[16:17], s[4:5], -1, v18, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v9, v18, v16 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v18, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_addc_u32_e32 v16, vcc, v8, v12, vcc +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v6, v4, 0 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v16, v[13:14] +; CGP-NEXT: v_mul_lo_u32 v17, v16, v12 +; CGP-NEXT: v_mul_hi_u32 v18, v4, v12 +; CGP-NEXT: v_mul_hi_u32 v19, v16, v12 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v4, v[14:15] +; CGP-NEXT: v_mul_lo_u32 v13, v4, v12 +; CGP-NEXT: v_mul_hi_u32 v15, v4, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v17, v13 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v9, v19, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v19, v16 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; CGP-NEXT: v_mul_hi_u32 v14, v18, v16 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v18 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v9, v14 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 -; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v15, v0, v9 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v19, v16 -; CGP-NEXT: v_xor_b32_e32 v17, v1, v9 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_mul_lo_u32 v14, v16, v12 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; CGP-NEXT: v_mul_hi_u32 v17, v16, v12 +; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v18, v0, v12 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v13 +; CGP-NEXT: v_xor_b32_e32 v19, v1, v12 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, 
vcc, v13, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v17, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v15, v1 -; CGP-NEXT: v_mul_hi_u32 v16, v15, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v17, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v17, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 +; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 +; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v16, v17, v1 +; CGP-NEXT: v_mul_lo_u32 v15, v19, v1 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v15, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v18, v17, v1 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v15, v19, v1 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v0 ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v13, 0 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v16 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v13 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v15, vcc, v15, v0 -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v17, v13 -; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v17, v13, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v14 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v15, vcc, v18, v0 +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 +; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v19, v13, vcc ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; CGP-NEXT: v_sub_i32_e32 v18, vcc, v15, v4 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4 ; CGP-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 -; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5] -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1] +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[13:14] -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v18, v4 -; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v19, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 -; CGP-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; CGP-NEXT: 
v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_mul_hi_u32 v5, v7, v0 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v7, v1 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v18, v4 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[0:1] +; CGP-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v19, vcc +; CGP-NEXT: v_mul_lo_u32 v1, v7, v13 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; CGP-NEXT: v_cndmask_b32_e32 v0, v18, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v5, v19, v21, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v14, vcc -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v12, v16, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v13, v5, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[7:8] -; CGP-NEXT: v_xor_b32_e32 v1, v12, v9 -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v7, v2, v12 -; CGP-NEXT: v_mul_lo_u32 v2, v11, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v10, v5 -; CGP-NEXT: v_xor_b32_e32 v8, v3, v12 -; CGP-NEXT: v_mul_hi_u32 v3, v10, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v11, v0 +; CGP-NEXT: v_cndmask_b32_e32 v14, v15, v0, vcc +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v9, v1 +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v0, v10 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 +; CGP-NEXT: v_mul_hi_u32 v1, v7, v13 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v13 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v10, v1 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v7, v0 +; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v1, s[4:5] +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, 0 +; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc +; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v3 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v10, v[1:2] +; CGP-NEXT: v_xor_b32_e32 v1, v5, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v9, v[7:8] +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v7, v2, v13 +; CGP-NEXT: v_mul_lo_u32 v2, v10, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v9, v5 +; CGP-NEXT: v_xor_b32_e32 v8, v3, v13 +; CGP-NEXT: v_mul_hi_u32 v3, v9, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 
0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v3, v11, v5
+; CGP-NEXT: v_mul_lo_u32 v3, v10, v5
; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; CGP-NEXT: v_mul_hi_u32 v6, v10, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v9, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v11, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v10, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0
-; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v0
+; CGP-NEXT: v_addc_u32_e32 v2, vcc, v10, v2, vcc
; CGP-NEXT: v_mul_lo_u32 v5, v8, v3
; CGP-NEXT: v_mul_lo_u32 v6, v7, v2
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9
-; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
+; CGP-NEXT: v_xor_b32_e32 v11, v14, v12
; CGP-NEXT: v_mul_hi_u32 v9, v7, v3
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v12
+; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9
@@ -2126,11 +2108,11 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6
; CGP-NEXT: v_add_i32_e32 v5, vcc, v3, v5
-; CGP-NEXT: v_mul_hi_u32 v10, v8, v2
+; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v9, v8, v2
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v3
; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0
-; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v9
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v5
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v6
; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[3:4]
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2
; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc
@@ -2154,10 +2136,10 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v12
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v12
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v13
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v13
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v13
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = srem <2 x i64> %num, <i64 1235195, i64 1235195>
ret <2 x i64> %result
@@ -2193,102 +2175,100 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-NEXT: v_xor_b32_e32 v1, v2, v1
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v0
; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v1
-; CHECK-NEXT: v_sub_i32_e32 v11, vcc, 0, v0
-; CHECK-NEXT: v_subb_u32_e32 v12, vcc, 0, v1, vcc
+; CHECK-NEXT: v_sub_i32_e32 v12, vcc, 0, v0
+; CHECK-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc
; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v5
; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2
-; CHECK-NEXT: v_trunc_f32_e32 v7, v5
-; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v7
-; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v2
-; CHECK-NEXT: v_cvt_u32_f32_e32 v13, v7
-; CHECK-NEXT: 
v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 -; CHECK-NEXT: v_mov_b32_e32 v2, v6 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[2:3] -; CHECK-NEXT: v_mul_lo_u32 v2, v13, v5 -; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7] -; CHECK-NEXT: v_mul_hi_u32 v6, v10, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v13, v5 -; CHECK-NEXT: v_mul_lo_u32 v7, v10, v8 -; CHECK-NEXT: v_mul_lo_u32 v9, v13, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v10, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; CHECK-NEXT: v_trunc_f32_e32 v5, v5 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v5 +; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v5 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v2, 0 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v11, v[6:7] +; CHECK-NEXT: v_mul_lo_u32 v6, v11, v5 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v2, v[7:8] +; CHECK-NEXT: v_mul_hi_u32 v7, v2, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 +; CHECK-NEXT: v_mul_lo_u32 v8, v2, v9 +; CHECK-NEXT: v_mul_lo_u32 v10, v11, v9 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v2, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_mul_hi_u32 v8, v11, v9 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_mul_hi_u32 v7, v13, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v2 -; CHECK-NEXT: v_addc_u32_e32 v13, vcc, v13, v5, vcc -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 -; CHECK-NEXT: v_mov_b32_e32 v2, v6 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[2:3] -; CHECK-NEXT: v_ashrrev_i32_e32 v11, 31, v4 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v11 -; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7] -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v4, v11, vcc -; CHECK-NEXT: v_xor_b32_e32 v9, v2, v11 -; CHECK-NEXT: v_mul_lo_u32 v2, v13, v5 -; CHECK-NEXT: v_mul_lo_u32 v4, v10, v8 -; CHECK-NEXT: v_xor_b32_e32 v12, v3, v11 -; CHECK-NEXT: v_mul_hi_u32 v3, v10, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v13, v5 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v3, v13, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; CHECK-NEXT: v_mul_hi_u32 v4, v10, v8 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v6, vcc +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v2, 0 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v11, v[6:7] +; CHECK-NEXT: v_ashrrev_i32_e32 
v12, 31, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v2, v[7:8] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v12, vcc +; CHECK-NEXT: v_xor_b32_e32 v8, v3, v12 +; CHECK-NEXT: v_mul_lo_u32 v3, v11, v5 +; CHECK-NEXT: v_mul_lo_u32 v6, v2, v9 +; CHECK-NEXT: v_xor_b32_e32 v10, v4, v12 +; CHECK-NEXT: v_mul_hi_u32 v4, v2, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_mul_hi_u32 v5, v13, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v4, v11, v9 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v2, v9 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v12, v2 -; CHECK-NEXT: v_mul_lo_u32 v5, v9, v3 -; CHECK-NEXT: v_mul_hi_u32 v6, v9, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v12, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v4, vcc +; CHECK-NEXT: v_mul_lo_u32 v4, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v8, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v12, v3 +; CHECK-NEXT: v_mul_lo_u32 v6, v10, v3 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_mul_hi_u32 v5, v9, v3 +; CHECK-NEXT: v_mul_hi_u32 v5, v8, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v2, v4 -; CHECK-NEXT: v_mul_hi_u32 v6, v12, v3 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v8, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v2, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v2 +; CHECK-NEXT: v_mul_hi_u32 v5, v10, v3 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v9, 0 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v5, v4 ; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v6, v[3:4] -; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v8, v[4:5] -; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v6, vcc -; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v6 +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v9, v[4:5] +; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v10, v6, vcc +; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v10, v6 ; CHECK-NEXT: 
v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 @@ -2313,10 +2293,10 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v11 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v11 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v11 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v11, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v12 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v12 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 ; CHECK-NEXT: ; implicit-def: $vgpr3 ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -2351,224 +2331,220 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-LABEL: v_srem_v2i64_pow2_shl_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v8, 0x1000 -; GISEL-NEXT: v_mov_b32_e32 v9, 0 -; GISEL-NEXT: v_lshl_b64 v[4:5], v[8:9], v4 +; GISEL-NEXT: v_mov_b32_e32 v10, 0x1000 +; GISEL-NEXT: v_mov_b32_e32 v11, 0 +; GISEL-NEXT: v_lshl_b64 v[4:5], v[10:11], v4 ; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v5, v7, vcc +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v5, v7, vcc ; GISEL-NEXT: v_xor_b32_e32 v5, v4, v7 -; GISEL-NEXT: v_xor_b32_e32 v7, v10, v7 +; GISEL-NEXT: v_xor_b32_e32 v7, v8, v7 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v5 -; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v7 -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v5 -; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v10 +; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v7 +; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v5 +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v7, vcc +; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v8 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v12, v10 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v12 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v18, v10 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12] -; GISEL-NEXT: v_mul_hi_u32 v12, v15, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v18, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v18, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; GISEL-NEXT: v_mul_hi_u32 v11, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v4 +; GISEL-NEXT: v_trunc_f32_e32 v8, v8 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v4, 0 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v16, v[9:10] +; GISEL-NEXT: v_mul_lo_u32 v9, v16, v8 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v4, v[12:13] +; GISEL-NEXT: v_mul_lo_u32 
v12, v4, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v4, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v8, v16, v8 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_mul_hi_u32 v13, v4, v14 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v12, v18, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4 -; GISEL-NEXT: v_addc_u32_e32 v18, vcc, v18, v10, vcc -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5] +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v4, v8 +; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v16, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v19, 0 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v16, v[9:10] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v0, v18, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 -; GISEL-NEXT: v_xor_b32_e32 v16, v1, v4 -; GISEL-NEXT: v_mul_hi_u32 v1, v15, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v18, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v19, v[12:13] +; GISEL-NEXT: v_xor_b32_e32 v15, v0, v4 +; GISEL-NEXT: v_mul_lo_u32 v0, v16, v8 +; GISEL-NEXT: v_mul_lo_u32 v9, v19, v14 +; GISEL-NEXT: v_xor_b32_e32 v17, v1, v4 +; GISEL-NEXT: v_mul_hi_u32 v1, v19, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v16, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v18, v13 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v11, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v18, v13 +; GISEL-NEXT: v_mul_lo_u32 v1, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v9, v19, v14 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_mul_hi_u32 
v9, v16, v14 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v0 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v18, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v16, v10 -; GISEL-NEXT: v_mul_lo_u32 v14, v12, v11 -; GISEL-NEXT: v_lshl_b64 v[0:1], v[8:9], v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v16, v11 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_mul_hi_u32 v9, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v19, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v17, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v15, v1 +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0 +; GISEL-NEXT: v_lshl_b64 v[10:11], v[10:11], v6 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v8, v6 -; GISEL-NEXT: v_mul_hi_u32 v8, v16, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v8, v6 -; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v13, 0 -; GISEL-NEXT: v_xor_b32_e32 v6, v0, v8 -; GISEL-NEXT: v_xor_b32_e32 v8, v1, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v8 -; GISEL-NEXT: v_mov_b32_e32 v0, v10 -; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v6 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v5, v14, v[0:1] -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v15 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v14, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v13, v[10:11] -; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v8, vcc -; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v14 -; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v13, v10 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v1 -; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v13 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v12, v9 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v15, 0 -; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], v16, v0, vcc -; GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v19, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v19, v10 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v16, v0 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v18, v15, v[11:12] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v20, v7 -; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v15, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[6:7] -; 
GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v20, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v11, v1, v11, s[6:7] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v9, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v14, s[6:7], 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v14, v7 -; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v14, v7 -; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v1, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v16, v16, v21, s[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v7, v14, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_hi_u32 v1, v19, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v19, v13 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v17, v1 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_hi_u32 v9, v15, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v12, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v19, v13 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v0 -; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v19, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v13, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v9, v17, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v9, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v12, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v7, v14, v[8:9] +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v15, v0 +; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v17, v12, vcc +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v17, v12 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v5 +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v7 +; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v13, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v12, v1, v6, s[4:5] +; GISEL-NEXT: v_subbrev_u32_e64 v16, s[4:5], 0, v0, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v1, 31, v11 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v10, v1 +; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v11, v1, s[4:5] +; GISEL-NEXT: v_xor_b32_e32 v6, v6, v1 +; GISEL-NEXT: v_xor_b32_e32 v8, v8, v1 +; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v8 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v5 +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; 
GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v16, v7 +; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v0, v7, vcc +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 +; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GISEL-NEXT: v_trunc_f32_e32 v1, v1 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v0 +; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v11, v10, v11, s[4:5] +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v17, 0 +; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v18, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v21, v18, v0 +; GISEL-NEXT: v_mul_hi_u32 v22, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v23, v18, v0 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v15, v5 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v7, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v14, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4 -; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v18, v13, v[9:10] -; GISEL-NEXT: v_cndmask_b32_e32 v7, v20, v7, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v5 -; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v9, v13, v11 -; GISEL-NEXT: v_xor_b32_e32 v15, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v15, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v17, v[9:10] +; GISEL-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v17, v0 +; GISEL-NEXT: v_mul_lo_u32 v10, v18, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v21, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v22 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_mul_hi_u32 v9, v17, v0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v23 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v1 +; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v18, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v15, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v13, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v16, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v7, v4 +; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v20, v15, v[9:10] +; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_xor_b32_e32 v12, v2, v7 +; GISEL-NEXT: v_mul_lo_u32 v2, v16, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v15, v11 +; GISEL-NEXT: v_xor_b32_e32 v13, v3, v7 +; GISEL-NEXT: v_mul_hi_u32 v3, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, 
v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v11 +; GISEL-NEXT: v_mul_lo_u32 v3, v16, v11 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_mul_hi_u32 v9, v13, v11 +; GISEL-NEXT: v_mul_hi_u32 v9, v15, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v14, v11 +; GISEL-NEXT: v_mul_hi_u32 v9, v16, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v16, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 ; GISEL-NEXT: v_mul_lo_u32 v9, v12, v2 ; GISEL-NEXT: v_mul_hi_u32 v10, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 +; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_xor_b32_e32 v5, v5, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v15, v2 +; GISEL-NEXT: v_mul_lo_u32 v10, v13, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3 ; GISEL-NEXT: v_mul_hi_u32 v9, v12, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 @@ -2577,26 +2553,25 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v10, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v13, v[0:1] +; GISEL-NEXT: v_mul_hi_u32 v9, v13, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v0, v[3:4] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v4, vcc +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v4, vcc ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v11, v[9:10] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v13, v3 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v8 ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5] ; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v2, v6 ; GISEL-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v8 @@ -2611,13 +2586,13 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> 
%x, <2 x i64> %y) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GISEL-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v5 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v5 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v7 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v7 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_pow2_shl_denom: @@ -2645,103 +2620,100 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v1, v4, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v10, v1 -; CGP-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 -; CGP-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v17, vcc, 0, v0 +; CGP-NEXT: v_subb_u32_e32 v18, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v10 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v12, v10 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v12 -; CGP-NEXT: v_cvt_u32_f32_e32 v15, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v18, v12 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 -; CGP-NEXT: v_mov_b32_e32 v4, v11 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v18, v10 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12] -; CGP-NEXT: v_mul_hi_u32 v11, v15, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v18, v10 -; CGP-NEXT: v_mul_lo_u32 v12, v15, v13 -; CGP-NEXT: v_mul_lo_u32 v14, v18, v13 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v15, v13 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v14, v10 +; CGP-NEXT: v_trunc_f32_e32 v10, v10 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v10 +; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v16, v10 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v4, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v16, v[11:12] +; CGP-NEXT: v_mul_lo_u32 v11, v16, v10 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v4, v[12:13] +; CGP-NEXT: v_mul_hi_u32 v12, v4, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 +; CGP-NEXT: v_mul_lo_u32 v13, v4, v14 +; CGP-NEXT: v_mul_lo_u32 v15, v16, v14 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v4, v14 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_hi_u32 v13, v16, v14 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_mul_hi_u32 v12, v18, v13 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: 
v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v4 -; CGP-NEXT: v_addc_u32_e32 v18, vcc, v18, v10, vcc -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 -; CGP-NEXT: v_mov_b32_e32 v4, v11 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v16, 31, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v16 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12] -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v16, vcc -; CGP-NEXT: v_xor_b32_e32 v14, v4, v16 -; CGP-NEXT: v_mul_lo_u32 v4, v18, v10 -; CGP-NEXT: v_mul_lo_u32 v9, v15, v13 -; CGP-NEXT: v_xor_b32_e32 v17, v8, v16 -; CGP-NEXT: v_mul_hi_u32 v8, v15, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v18, v10 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v18, v13 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v9, v15, v13 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v11, vcc +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v4, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v16, v[11:12] +; CGP-NEXT: v_ashrrev_i32_e32 v17, 31, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v17 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v4, v[12:13] +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v17, vcc +; CGP-NEXT: v_xor_b32_e32 v15, v8, v17 +; CGP-NEXT: v_mul_lo_u32 v8, v16, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v4, v14 +; CGP-NEXT: v_xor_b32_e32 v18, v9, v17 +; CGP-NEXT: v_mul_hi_u32 v9, v4, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v18, v13 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v4, v14 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v16, v14 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v18, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v17, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v14, v8 -; CGP-NEXT: v_mul_hi_u32 v11, v14, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v4 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v16, v9, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v18, v4 +; CGP-NEXT: v_mul_lo_u32 v10, v15, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v15, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v18, v4 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, 
v17, v8 +; CGP-NEXT: v_mul_lo_u32 v11, v18, v8 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v14, v8 +; CGP-NEXT: v_mul_hi_u32 v10, v15, v8 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v11, v17, v8 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v13, 0 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v4 -; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v11, v[4:5] -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v14, v8 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v1, v13, v[9:10] -; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v17, v11, vcc -; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v17, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v11, v18, v8 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v9 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v4, 0 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v11, v10 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v0, v12, v[9:10] +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v1, v4, v[10:11] +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v15, v8 +; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v18, v12, vcc +; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v18, v12 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v0 @@ -2766,10 +2738,10 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v16 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v16 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v16, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v17 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v17 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v17 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v17, vcc ; CGP-NEXT: ; implicit-def: $vgpr11_vgpr12 ; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: .LBB8_2: ; %Flow1 @@ -2827,76 +2799,74 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v6, v6 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0 -; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v6, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v6, v8 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v9, v13, v8 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v4, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v6, v[9:10] +; CGP-NEXT: v_mul_lo_u32 v9, v6, v8 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[10:11] +; CGP-NEXT: v_mul_hi_u32 v10, v4, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 -; CGP-NEXT: v_mul_lo_u32 v10, v13, v11 -; CGP-NEXT: v_mul_lo_u32 v12, v6, v11 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v13, v11 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: 
v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8 +; CGP-NEXT: v_mul_lo_u32 v11, v4, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v6, v12 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v4, v12 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_hi_u32 v11, v6, v12 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v6, v11 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v4 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0 -; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v6, v[4:5] +; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v9, vcc +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v4, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v6, v[9:10] ; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v7 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v14 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10] -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v14, vcc -; CGP-NEXT: v_xor_b32_e32 v10, v4, v14 -; CGP-NEXT: v_mul_lo_u32 v4, v6, v8 -; CGP-NEXT: v_mul_lo_u32 v7, v13, v11 -; CGP-NEXT: v_xor_b32_e32 v12, v5, v14 -; CGP-NEXT: v_mul_hi_u32 v5, v13, v8 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[10:11] +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v10, v5, v14 +; CGP-NEXT: v_mul_lo_u32 v5, v6, v8 +; CGP-NEXT: v_mul_lo_u32 v9, v4, v12 +; CGP-NEXT: v_xor_b32_e32 v11, v7, v14 +; CGP-NEXT: v_mul_hi_u32 v7, v4, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v6, v11 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v7, v13, v11 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_mul_hi_u32 v8, v6, v11 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v7, v6, v12 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; CGP-NEXT: v_mul_hi_u32 v9, v4, v12 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v6, v12 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_addc_u32_e32 
v5, vcc, v6, v5, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v12, v4 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, v6, v7, vcc +; CGP-NEXT: v_mul_lo_u32 v6, v11, v4 ; CGP-NEXT: v_mul_lo_u32 v7, v10, v5 ; CGP-NEXT: v_mul_hi_u32 v8, v10, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v11, v4 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v12, v5 +; CGP-NEXT: v_mul_lo_u32 v8, v11, v5 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CGP-NEXT: v_mul_hi_u32 v7, v10, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 @@ -2904,17 +2874,17 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v6 -; CGP-NEXT: v_mul_hi_u32 v8, v12, v5 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v11, 0 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v6 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v4, v6 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v4 +; CGP-NEXT: v_mul_hi_u32 v7, v11, v5 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v12, 0 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v7, v6 ; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[5:6] ; CGP-NEXT: v_sub_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[6:7] -; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v12, v8, vcc -; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v12, v8 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v12, v[6:7] +; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v11, v8, vcc +; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v11, v8 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2 @@ -3036,29 +3006,30 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1 ; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v9, 0 -; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, 0, v1 ; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v9 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GISEL-NEXT: v_subb_u32_e64 v13, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; GISEL-NEXT: v_trunc_f32_e32 v5, v4 -; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v10, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[7:8] -; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3 -; GISEL-NEXT: v_mul_lo_u32 v8, v10, v4 -; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 -; GISEL-NEXT: v_mul_lo_u32 v14, v13, v4 +; GISEL-NEXT: v_trunc_f32_e32 v4, v4 +; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 
v10, v4 +; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v11, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v10, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v11, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v5, v10, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v3 +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v4 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v3 +; GISEL-NEXT: v_mul_lo_u32 v14, v10, v4 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v4 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3 @@ -3066,165 +3037,160 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v10, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v3 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v10, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] -; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v0 -; GISEL-NEXT: v_mul_lo_u32 v0, v13, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 -; GISEL-NEXT: v_mul_lo_u32 v5, v10, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v3 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v10, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v11, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v10, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v11, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v5, v10, v3 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v3 +; GISEL-NEXT: v_mul_lo_u32 v7, v11, v4 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v3 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v13, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v5, v10, v4 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; GISEL-NEXT: v_mul_lo_u32 v8, v10, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v4, v10, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v13, v3, vcc -; GISEL-NEXT: 
v_mul_lo_u32 v4, 0, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v7 -; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_mul_lo_u32 v5, 0, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v11, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v12, 0 -; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7 -; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v7, v[0:1] -; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v8 -; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v9, v7 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v9 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v0 -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 0, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v9 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v14, v13, 0 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], 0, v12, v[5:6] -; GISEL-NEXT: v_mov_b32_e32 v0, v8 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v16, v[0:1] -; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v11, v4 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v13, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v4, v16, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v13, v7 -; GISEL-NEXT: v_mul_lo_u32 v5, v13, v10 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, v9, vcc -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], 0, v9 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v3 +; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v10, v4, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, 0, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, v0, v4 +; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v0, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, 0, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_mul_hi_u32 v7, v0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v5, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v3 +; GISEL-NEXT: v_mul_hi_u32 v8, 0, v4 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v10, 0 +; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v9 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v3 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v8, v[5:6] +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v11 +; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v10, v[6:7] +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v6, v6 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v6 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v11, 0 +; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 
v[9:10], s[4:5], v13, v12, v[6:7] +; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], 0, v8 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v11, v[9:10] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v7, v12, v5 +; GISEL-NEXT: v_mul_lo_u32 v9, v11, v6 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v10, -1, v4, s[4:5] +; GISEL-NEXT: v_mul_hi_u32 v4, v11, v5 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v7, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v8, v16, v10 -; GISEL-NEXT: v_mul_hi_u32 v7, v16, v7 -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4 -; GISEL-NEXT: v_mul_hi_u32 v5, v13, v10 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v7, v12, v6 +; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v9, v11, v6 ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v16, v10 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v9 +; GISEL-NEXT: v_mul_hi_u32 v6, v12, v6 ; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v4 -; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], v16, v5, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v9, 0 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 -; GISEL-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v0, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v11, -1, v6, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v10, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v12, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v7, vcc -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v15, v9, v[5:6] -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v6, v5 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v4 +; GISEL-NEXT: v_addc_u32_e64 v12, s[4:5], v12, v5, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v11, 0 +; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, v[5:6] +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v0, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v11, v[6:7] +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v13, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7 -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v14, -1, v5, vcc -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v4 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v12, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v11, v8 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; GISEL-NEXT: v_cndmask_b32_e32 v9, -1, v5, vcc +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, 
vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v7 -; GISEL-NEXT: v_mul_hi_u32 v4, v10, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7 +; GISEL-NEXT: v_mul_lo_u32 v6, v12, v8 +; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v8 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v7 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v12, v8 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v10, v5, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, 0, v4 -; GISEL-NEXT: v_mul_lo_u32 v7, v2, v6 -; GISEL-NEXT: v_mul_hi_u32 v9, v2, v4 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v0, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v13, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_mul_lo_u32 v7, 0, v6 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v12, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, 0, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v2, v5 +; GISEL-NEXT: v_mul_hi_u32 v11, v2, v4 +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v16, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_mul_lo_u32 v7, 0, v5 ; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v2, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v2, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v4, v5 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, 0 -; GISEL-NEXT: v_mul_hi_u32 v7, 0, v6 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v7, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v8, v13, v8, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v12, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v9, v[5:6] +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v4, v6 +; GISEL-NEXT: v_mul_hi_u32 v12, 0, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v11, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v12, v[5:6] +; GISEL-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v11, v[6:7] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v7 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v7, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v8 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v8, 
vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v3 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index 23ef596c021c2..c50b491bcb074 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -129,11 +129,11 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX8-NEXT: v_trunc_f32_e32 v2, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX8-NEXT: v_trunc_f32_e32 v1, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] ; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 @@ -203,11 +203,11 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2 -; GFX8-NEXT: v_mul_hi_u32 v4, s9, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0 +; GFX8-NEXT: v_mul_hi_u32 v3, s9, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v3, v2 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2] ; GFX8-NEXT: v_mov_b32_e32 v1, s11 ; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3] @@ -268,11 +268,11 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX9-NEXT: v_trunc_f32_e32 v2, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v1 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] ; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 @@ -468,31 +468,31 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v0, s0, v5, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX10-NEXT: v_mul_hi_u32 v5, s17, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v2, v6, v2 ; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v5, s0, v0, v2 -; GFX10-NEXT: v_mul_hi_u32 v2, s17, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s18, v5, 0 -; GFX10-NEXT: v_add3_u32 v3, v3, v6, v2 +; GFX10-NEXT: v_add_co_u32 v6, s0, v0, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v4, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s18, v6, 0 +; GFX10-NEXT: 
v_add3_u32 v3, v2, v3, v5 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s18, v3, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s19, v5, v[1:2] -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v5, 1 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s19, v6, v[1:2] +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v6, 1 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v7, vcc_lo, s16, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v6, s17, v1 +; GFX10-NEXT: v_sub_nc_u32_e32 v5, s17, v1 ; GFX10-NEXT: v_sub_co_ci_u32_e64 v8, s0, s17, v1, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s19, v6, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s19, v5, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s18, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v7, s18 +; GFX10-NEXT: v_sub_co_u32 v5, vcc_lo, v7, s18 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v0, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s19, v8 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s19, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s18, v6 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s18, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s19, v9 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0 @@ -503,18 +503,18 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s19, v8 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v10, v1, s0 -; GFX10-NEXT: v_sub_co_u32 v10, s0, v6, s18 +; GFX10-NEXT: v_sub_co_u32 v10, s0, v5, s18 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v0, s0, 0, v0, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v13, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v14, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v6, v9, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v0, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v9, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, v1, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v8, v6, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v8, v5, s0 ; GFX10-NEXT: global_store_dwordx2 v9, v[0:1], s[12:13] ; GFX10-NEXT: global_store_dwordx2 v9, v[2:3], s[14:15] ; GFX10-NEXT: s_endpgm @@ -1005,14 +1005,14 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v10, s13 +; GFX8-NEXT: v_mov_b32_e32 v9, s13 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX8-NEXT: v_trunc_f32_e32 v2, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX8-NEXT: v_trunc_f32_e32 v1, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] ; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 @@ -1042,12 +1042,10 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, 
v6, 0 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] ; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 -; GFX8-NEXT: s_sub_u32 s2, 0, s14 ; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] ; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 ; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4 -; GFX8-NEXT: s_subb_u32 s3, 0, s15 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 @@ -1084,112 +1082,113 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v0, v2 -; GFX8-NEXT: v_mul_hi_u32 v4, s9, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0 +; GFX8-NEXT: v_mul_hi_u32 v3, s9, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v8, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v9, v[1:2] +; GFX8-NEXT: v_add_u32_e64 v17, s[2:3], 1, v8 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v3, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v10, v[1:2] ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s8, v0 ; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s13, v8, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s14 ; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v2, v4, vcc ; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s9, v4 +; GFX8-NEXT: v_cvt_f32_u32_e32 v4, s15 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v1 -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX8-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4 +; GFX8-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v11, v3, v4, s[0:1] -; GFX8-NEXT: v_cvt_f32_u32_e32 v3, s15 -; GFX8-NEXT: v_cvt_f32_u32_e32 v4, s14 -; GFX8-NEXT: v_subb_u32_e32 v12, vcc, v2, v10, vcc -; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f800000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX8-NEXT: v_subrev_u32_e32 v13, vcc, s12, v1 -; GFX8-NEXT: v_subbrev_u32_e64 v14, s[0:1], 0, v12, vcc -; GFX8-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; GFX8-NEXT: v_subb_u32_e32 v12, vcc, v2, v9, vcc +; GFX8-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v11, v3, v6, s[0:1] ; GFX8-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; GFX8-NEXT: v_trunc_f32_e32 v4, v3 -; GFX8-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4 -; GFX8-NEXT: v_add_f32_e32 v2, v3, v2 -; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v2 -; GFX8-NEXT: v_add_u32_e64 v16, s[0:1], 1, v8 -; GFX8-NEXT: v_addc_u32_e64 v17, s[0:1], 0, v9, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, 0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v18, v4 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v14 -; GFX8-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v18, v[3:4] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v13 +; GFX8-NEXT: v_trunc_f32_e32 v3, v3 +; GFX8-NEXT: v_mul_f32_e32 v4, 0xcf800000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v4, v2 +; GFX8-NEXT: v_cvt_u32_f32_e32 v13, v2 +; GFX8-NEXT: s_sub_u32 s8, 0, s14 +; GFX8-NEXT: v_cvt_u32_f32_e32 v14, v3 +; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s12, v1 +; GFX8-NEXT: v_mad_u64_u32 
v[2:3], s[0:1], s8, v13, 0 +; GFX8-NEXT: v_subbrev_u32_e64 v16, s[0:1], 0, v12, vcc +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s8, v14, v[3:4] +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v16 +; GFX8-NEXT: s_subb_u32 s9, 0, s15 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v15, v[4:5] -; GFX8-NEXT: v_mul_lo_u32 v4, v18, v2 -; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v12, v10, vcc -; GFX8-NEXT: v_mul_lo_u32 v5, v15, v6 -; GFX8-NEXT: v_mul_hi_u32 v10, v15, v2 -; GFX8-NEXT: v_mul_hi_u32 v2, v18, v2 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v14 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v10 -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v10, v18, v6 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 -; GFX8-NEXT: v_mul_hi_u32 v5, v15, v6 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v10, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s9, v13, v[4:5] +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v15 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX8-NEXT: v_mul_lo_u32 v5, v14, v2 +; GFX8-NEXT: v_mul_lo_u32 v7, v13, v6 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] +; GFX8-NEXT: v_mul_hi_u32 v4, v13, v2 +; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], v5, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], v5, v4 +; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v12, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] +; GFX8-NEXT: v_mul_hi_u32 v2, v14, v2 +; GFX8-NEXT: v_mul_lo_u32 v9, v14, v6 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 +; GFX8-NEXT: v_mul_hi_u32 v7, v13, v6 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 +; GFX8-NEXT: v_addc_u32_e64 v18, s[2:3], 0, v10, s[2:3] +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 1, v17 +; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v18, vcc +; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v15 +; GFX8-NEXT: v_mul_hi_u32 v6, v14, v6 +; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v10, v5 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 1, v16 -; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v17, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v13 -; GFX8-NEXT: v_mul_hi_u32 v6, v18, v6 -; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v7, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v2 -; GFX8-NEXT: v_addc_u32_e32 v18, vcc, v18, v4, vcc -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, 0 +; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v2 +; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v14, v4, vcc +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s8, v13, 0 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v16, v10, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, v5 -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v18, v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v2, v17, v9, vcc +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s8, v14, v[5:6] ; GFX8-NEXT: 
v_cmp_ne_u32_e64 s[0:1], 0, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v3, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[2:3], s3, v15, v[5:6] -; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v12, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v18, v4 -; GFX8-NEXT: v_mul_lo_u32 v8, v15, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v10, s[0:1] -; GFX8-NEXT: v_mul_hi_u32 v9, v15, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v19, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v10, v14, v20, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s9, v13, v[6:7] +; GFX8-NEXT: v_mul_lo_u32 v6, v14, v4 +; GFX8-NEXT: v_mul_hi_u32 v9, v13, v4 +; GFX8-NEXT: v_mul_lo_u32 v7, v13, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v18, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v10, v3, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v15, v19, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v10, v16, v20, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v9, v18, v7 -; GFX8-NEXT: v_mul_hi_u32 v4, v18, v4 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 -; GFX8-NEXT: v_mul_hi_u32 v8, v15, v7 +; GFX8-NEXT: v_mul_lo_u32 v9, v14, v8 +; GFX8-NEXT: v_mul_hi_u32 v4, v14, v4 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 +; GFX8-NEXT: v_mul_hi_u32 v7, v13, v8 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v9, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v9, v8 -; GFX8-NEXT: v_mul_hi_u32 v7, v18, v7 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 +; GFX8-NEXT: v_mul_hi_u32 v8, v14, v8 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v15, v4 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v18, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v13, v4 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v14, v6, vcc ; GFX8-NEXT: v_mul_lo_u32 v8, s11, v4 ; GFX8-NEXT: v_mul_lo_u32 v9, s10, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v5, s[0:1] @@ -1206,15 +1205,14 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v5, v8 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v8 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v4, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_mul_hi_u32 v7, s11, v7 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v5, v1 ; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s14, v11, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v8, v1 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, v7, v1 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s14, v12, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s14, v12, v[5:6] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v0, v10, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v5, s15 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s15, v11, v[8:9] @@ -1274,13 +1272,16 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; 
GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v9, s5 +; GFX9-NEXT: s_sub_u32 s8, 0, s6 +; GFX9-NEXT: s_subb_u32 s9, 0, s7 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX9-NEXT: v_trunc_f32_e32 v2, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 +; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v1 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] ; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 @@ -1307,15 +1308,12 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 -; GFX9-NEXT: v_mov_b32_e32 v8, s5 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] ; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 -; GFX9-NEXT: s_sub_u32 s2, 0, s6 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] ; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 ; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4 -; GFX9-NEXT: s_subb_u32 s3, 0, s7 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 @@ -1350,134 +1348,132 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v9, 0 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v8, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX9-NEXT: v_add3_u32 v10, v3, v2, v5 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s4, v10, v[1:2] ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s16, v0 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v9, v[2:3] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v8, v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s6 ; GFX9-NEXT: v_subb_co_u32_e64 v0, s[0:1], v2, v4, vcc +; GFX9-NEXT: v_sub_u32_e32 v2, s17, v4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s7 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, s17, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX9-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4 +; GFX9-NEXT: v_add_f32_e32 v4, v4, v5 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, v4, s[0:1] -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6 -; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, v2, v8, vcc -; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f800000, v3 -; GFX9-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX9-NEXT: v_subrev_co_u32_e32 v13, vcc, s4, v1 -; GFX9-NEXT: v_subbrev_co_u32_e64 v14, s[0:1], 0, v12, vcc -; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, v2, 
v9, vcc +; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, v6, s[0:1] ; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; GFX9-NEXT: v_trunc_f32_e32 v4, v3 -; GFX9-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4 -; GFX9-NEXT: v_add_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v2 -; GFX9-NEXT: v_add_co_u32_e64 v16, s[0:1], 1, v9 -; GFX9-NEXT: v_addc_co_u32_e64 v17, s[0:1], 0, v10, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, 0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v18, v4 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v14 -; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v18, v[3:4] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v13 +; GFX9-NEXT: v_trunc_f32_e32 v3, v3 +; GFX9-NEXT: v_mul_f32_e32 v4, 0xcf800000, v3 +; GFX9-NEXT: v_add_f32_e32 v2, v4, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v14, v3 +; GFX9-NEXT: v_subrev_co_u32_e32 v15, vcc, s4, v1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v13, 0 +; GFX9-NEXT: v_subbrev_co_u32_e64 v16, s[0:1], 0, v12, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s8, v14, v[3:4] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v16 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v15, v[4:5] -; GFX9-NEXT: v_mul_lo_u32 v4, v18, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v12, v8, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, v15, v6 -; GFX9-NEXT: v_mul_hi_u32 v8, v15, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, v18, v2 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v14 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, v18, v6 -; GFX9-NEXT: v_add_u32_e32 v4, v5, v4 -; GFX9-NEXT: v_mul_hi_u32 v5, v15, v6 -; GFX9-NEXT: v_mul_hi_u32 v6, v18, v6 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s9, v13, v[4:5] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v15 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v5, v14, v2 +; GFX9-NEXT: v_mul_lo_u32 v7, v13, v6 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v16 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] +; GFX9-NEXT: v_mul_hi_u32 v4, v13, v2 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[0:1], v5, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v12, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] +; GFX9-NEXT: v_mul_hi_u32 v2, v14, v2 +; GFX9-NEXT: v_mul_lo_u32 v9, v14, v6 +; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX9-NEXT: v_mul_hi_u32 v7, v13, v6 +; GFX9-NEXT: v_add_co_u32_e64 v17, s[2:3], 1, v8 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v18, s[2:3], 0, v10, s[2:3] +; GFX9-NEXT: v_add_u32_e32 v7, v9, v7 +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, 1, v17 +; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v18, vcc +; GFX9-NEXT: v_mul_hi_u32 v6, v14, v6 +; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v15 +; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_u32_e32 v5, v8, v5 -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 1, v16 -; 
GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v17, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v4, v5, v4, v6 -; GFX9-NEXT: v_add_co_u32_e32 v15, vcc, v15, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[0:1] -; GFX9-NEXT: v_addc_co_u32_e32 v18, vcc, v18, v4, vcc -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, 0 -; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v13 -; GFX9-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v18, v[2:3] -; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v7, vcc +; GFX9-NEXT: v_add3_u32 v4, v7, v4, v6 +; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v13, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, v14, v4, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s8, v13, 0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v16, v8, vcc -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s3, v15, v[5:6] +; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v9, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s8, v14, v[5:6] ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11 -; GFX9-NEXT: v_mul_lo_u32 v5, v18, v4 -; GFX9-NEXT: v_mul_lo_u32 v6, v15, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v9, v2, s[0:1] -; GFX9-NEXT: v_mul_hi_u32 v9, v15, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v18, v4 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[2:3], v5, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v5, s[2:3], v5, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v9, v18, v7 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 -; GFX9-NEXT: v_mul_hi_u32 v6, v15, v7 -; GFX9-NEXT: v_mul_hi_u32 v7, v18, v7 -; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v9, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v4, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v4, v5 -; GFX9-NEXT: v_add_u32_e32 v6, v9, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3] -; GFX9-NEXT: v_add3_u32 v5, v6, v5, v7 -; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v15, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v5, s[2:3], v18, v5, s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v6, s19, v4 -; GFX9-NEXT: v_mul_lo_u32 v7, s18, v5 -; GFX9-NEXT: v_mul_hi_u32 v9, s18, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s9, v13, v[6:7] +; GFX9-NEXT: v_mul_lo_u32 v6, v14, v4 +; GFX9-NEXT: v_mul_hi_u32 v9, v13, v4 +; GFX9-NEXT: v_mul_lo_u32 v7, v13, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v18, v12, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, v10, v3, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v8, v13, v19, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v10, v14, v20, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v15, v19, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, v16, v20, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v9, s19, v5 -; GFX9-NEXT: v_mul_hi_u32 v4, s19, v4 +; GFX9-NEXT: v_mul_lo_u32 v9, v14, v8 +; GFX9-NEXT: v_mul_hi_u32 v4, v14, v4 ; GFX9-NEXT: v_add_u32_e32 v6, v7, v6 -; GFX9-NEXT: v_mul_hi_u32 v7, s18, v5 -; GFX9-NEXT: v_mul_hi_u32 v13, s19, v5 +; GFX9-NEXT: v_mul_hi_u32 v7, v13, v8 +; GFX9-NEXT: v_mul_hi_u32 v8, v14, v8 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v9, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 
1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v4, v6 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 +; GFX9-NEXT: v_add_u32_e32 v7, v9, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v6, v7, v6, v8 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v13, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v14, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, s19, v4 +; GFX9-NEXT: v_mul_lo_u32 v9, s18, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v6, v1, v5, s[0:1] +; GFX9-NEXT: v_mul_hi_u32 v1, s18, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, s19, v4 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v8, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v5, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v5, s19, v7 +; GFX9-NEXT: v_add_u32_e32 v1, v8, v1 +; GFX9-NEXT: v_mul_hi_u32 v8, s18, v7 +; GFX9-NEXT: v_mul_hi_u32 v7, s19, v7 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v4, v1 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s6, v11, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v6, v1, v8, s[0:1] -; GFX9-NEXT: v_add_u32_e32 v1, v9, v7 -; GFX9-NEXT: v_add3_u32 v12, v1, v12, v13 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s6, v12, v[1:2] +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v8, v9, v8 +; GFX9-NEXT: v_add3_u32 v12, v8, v1, v7 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s6, v12, v[5:6] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v0, v10, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s7, v11, v[8:9] @@ -1546,14 +1542,14 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GFX10-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 ; GFX10-NEXT: v_mul_f32_e32 v3, 0x2f800000, v1 -; GFX10-NEXT: v_trunc_f32_e32 v4, v2 -; GFX10-NEXT: v_trunc_f32_e32 v5, v3 -; GFX10-NEXT: v_mul_f32_e32 v2, 0xcf800000, v4 -; GFX10-NEXT: v_mul_f32_e32 v3, 0xcf800000, v5 -; GFX10-NEXT: v_cvt_u32_f32_e32 v9, v4 -; GFX10-NEXT: v_cvt_u32_f32_e32 v10, v5 -; GFX10-NEXT: v_add_f32_e32 v0, v2, v0 -; GFX10-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX10-NEXT: v_trunc_f32_e32 v2, v2 +; GFX10-NEXT: v_trunc_f32_e32 v4, v3 +; GFX10-NEXT: v_mul_f32_e32 v3, 0xcf800000, v2 +; GFX10-NEXT: v_mul_f32_e32 v5, 0xcf800000, v4 +; GFX10-NEXT: v_cvt_u32_f32_e32 v9, v2 +; GFX10-NEXT: v_cvt_u32_f32_e32 v10, v4 +; GFX10-NEXT: v_add_f32_e32 v0, v3, v0 +; GFX10-NEXT: v_add_f32_e32 v1, v5, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v1 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s1, v7, 0 @@ -1662,119 +1658,119 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mul_hi_u32 v4, s17, v4 ; GFX10-NEXT: v_mul_lo_u32 v9, s17, v2 ; GFX10-NEXT: v_mul_lo_u32 v6, s19, v1 -; GFX10-NEXT: v_mul_hi_u32 v10, s16, v2 -; GFX10-NEXT: v_mul_hi_u32 v11, s17, v2 -; GFX10-NEXT: v_mul_lo_u32 v2, s18, v0 +; GFX10-NEXT: v_mul_lo_u32 v11, s18, v0 ; GFX10-NEXT: v_mul_hi_u32 v7, s18, v1 ; GFX10-NEXT: v_mul_hi_u32 v1, s19, v1 ; GFX10-NEXT: v_mul_lo_u32 v12, s19, v0 -; GFX10-NEXT: v_mul_hi_u32 v13, s18, v0 -; GFX10-NEXT: v_mul_hi_u32 v14, s19, v0 -; GFX10-NEXT: v_add_co_u32 v0, s0, v3, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v4, s0, v9, v4 +; GFX10-NEXT: 
v_add_co_u32 v3, s0, v3, v8 +; GFX10-NEXT: v_mul_hi_u32 v10, s16, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v2, s0, v6, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v1, s0, v12, v1 +; GFX10-NEXT: v_add_co_u32 v4, s0, v9, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v1, s0, v12, v1 +; GFX10-NEXT: v_mul_hi_u32 v13, s18, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v3, s0, v3, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v10 ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v7 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v3, v0 +; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v7 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v2, v6, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v5, v8, v5 -; GFX10-NEXT: v_add_co_u32 v8, s0, v4, v0 +; GFX10-NEXT: v_mul_hi_u32 v10, s19, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v8, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v9, v5 +; GFX10-NEXT: v_add_nc_u32_e32 v5, v11, v6 +; GFX10-NEXT: v_mul_hi_u32 v2, s17, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v6, v12, v7 +; GFX10-NEXT: v_add_co_u32 v7, s0, v4, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v10, s0, v1, v2 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, s4, v8, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, s6, v10, 0 -; GFX10-NEXT: v_add_nc_u32_e32 v7, v9, v7 -; GFX10-NEXT: v_add3_u32 v9, v5, v4, v11 -; GFX10-NEXT: v_add_co_u32 v12, vcc_lo, v8, 1 -; GFX10-NEXT: v_mov_b32_e32 v11, 0 -; GFX10-NEXT: v_add3_u32 v7, v7, v6, v14 +; GFX10-NEXT: v_add_co_u32 v8, s0, v1, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s4, v7, 0 +; GFX10-NEXT: v_add3_u32 v9, v3, v4, v2 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, s6, v8, 0 +; GFX10-NEXT: v_add3_u32 v10, v6, v5, v10 +; GFX10-NEXT: v_mov_b32_e32 v12, 0 ; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s4, v9, v[1:2] -; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v9, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s6, v7, v[3:4] -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, s5, v8, v[4:5] -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v12, 1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v13, vcc_lo +; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v7, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v9, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s6, v10, v[3:4] +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v1, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v11, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v14, vcc_lo, s16, v0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s7, v10, v[5:6] -; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s0, s17, v3, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s4, v14 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, s17, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, -1, s0 -; GFX10-NEXT: v_sub_co_u32 v15, s0, s18, v2 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo -; GFX10-NEXT: v_sub_co_ci_u32_e64 v16, s1, s19, v0, s0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v15 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, s19, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo -; GFX10-NEXT: 
v_sub_co_u32 v17, vcc_lo, v14, s4 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, s1, 0, v1, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s5, v5 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v23, s0, s7, v0, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, v18 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, s5, v7, v[4:5] +; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s7, v8, v[5:6] +; GFX10-NEXT: v_sub_nc_u32_e32 v5, s17, v3 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v3, s0, s17, v3, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s5, v5, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v14 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v15, vcc_lo, v14, s4 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v16, s0, 0, v0, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v17, s0, s18, v2 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v18, s1, s19, v4, s0 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s4, v15 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s5, v0, vcc_lo +; GFX10-NEXT: v_sub_nc_u32_e32 v4, s19, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s1 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s5, v16 ; GFX10-NEXT: v_cndmask_b32_e64 v19, 0, -1, s1 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s4, v17 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s5, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, -1, s1 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s5, v18 -; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, -1, s1 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s7, v16 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v21, v20, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, -1, s1 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; GFX10-NEXT: v_sub_co_u32 v0, s0, v17, s4 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v19, s0, 0, v1, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v12, v4, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v6, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v6, s1, v15, s6 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v17, v0, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v12, s2, 0, v23, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v1, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, v3, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v18, v19, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s7, v16 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v14, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v3, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v22, v2, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v6 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s5, v16 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v19, v2, s1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s5, v3 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v20, v5, s1 +; GFX10-NEXT: v_sub_co_u32 v2, s1, v15, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s2, 0, v5 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v6, s1, 0, v0, s1 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s7, v18 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v1, s2 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v7, s0, s7, v4, s0 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v17 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, v5, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v15, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, -1, s1 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, s0 +; GFX10-NEXT: v_sub_co_u32 v11, s0, v17, s6 
+; GFX10-NEXT: v_subrev_co_ci_u32_e64 v13, s1, 0, v7, s0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s7, v18 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v14, v2, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v9, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v13 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc_lo -; GFX10-NEXT: v_add_co_u32 v13, vcc_lo, v10, 1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, 0, v7, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s7, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo -; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v13, 1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, 0, v14, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, vcc_lo, s7, v23, s1 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX10-NEXT: v_sub_co_u32 v8, s1, v6, s6 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, s1, 0, v18, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v13, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v13, v14, v17, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v8, v12, v18, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, v9, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v13, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v15, v6, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v16, v8, s1 -; GFX10-NEXT: global_store_dwordx4 v11, v[0:3], s[12:13] -; GFX10-NEXT: global_store_dwordx4 v11, v[4:7], s[14:15] +; GFX10-NEXT: v_add_co_u32 v14, vcc_lo, v8, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, 0, v10, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s7, v13 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo +; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v14, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, 0, v15, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v7, vcc_lo, s7, v7, s0 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v5 +; GFX10-NEXT: v_sub_co_u32 v5, s0, v11, s6 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v7, s0, 0, v7, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v9, v14, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v7, v13, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, v3, v6, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v8, v9, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v10, v14, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v17, v11, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v18, v7, s0 +; GFX10-NEXT: global_store_dwordx4 v12, v[0:3], s[12:13] +; GFX10-NEXT: global_store_dwordx4 v12, v[4:7], s[14:15] ; GFX10-NEXT: s_endpgm %div = udiv <2 x i64> %x, %y store <2 x i64> %div, ptr addrspace(1) %out0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index 28cb8f871cb9e..dd29ecf9dbe2e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -23846,8 +23846,17 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI-NEXT: s_mov_b32 s72, s74 ; SI-NEXT: s_mov_b32 s73, s75 ; SI-NEXT: s_mov_b32 s74, s76 -; SI-NEXT: v_readlane_b32 s75, v21, 0 -; SI-NEXT: v_readlane_b32 s76, v21, 1 +; SI-NEXT: s_mov_b32 s75, s77 +; SI-NEXT: s_mov_b32 s76, s78 +; SI-NEXT: s_mov_b32 s77, s79 +; SI-NEXT: s_mov_b32 s78, s88 +; SI-NEXT: s_mov_b32 s79, s89 +; SI-NEXT: s_mov_b32 s88, s90 +; SI-NEXT: s_mov_b32 s89, s91 +; SI-NEXT: s_mov_b32 s90, s92 +; SI-NEXT: 
s_mov_b32 s91, s93 +; SI-NEXT: v_readlane_b32 s92, v21, 0 +; SI-NEXT: v_readlane_b32 s93, v21, 1 ; SI-NEXT: s_cbranch_vccnz .LBB17_5 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 @@ -23909,16 +23918,16 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI-NEXT: s_lshl_b32 s62, s84, 16 ; SI-NEXT: s_and_b32 s73, s83, 0xffff0000 ; SI-NEXT: s_lshl_b32 s72, s83, 16 -; SI-NEXT: s_and_b32 s77, s82, 0xffff0000 +; SI-NEXT: s_and_b32 s75, s82, 0xffff0000 ; SI-NEXT: s_lshl_b32 s74, s82, 16 -; SI-NEXT: s_and_b32 s79, s81, 0xffff0000 -; SI-NEXT: s_lshl_b32 s78, s81, 16 -; SI-NEXT: s_and_b32 s89, s80, 0xffff0000 -; SI-NEXT: s_lshl_b32 s88, s80, 16 -; SI-NEXT: s_and_b32 s91, s71, 0xffff0000 -; SI-NEXT: s_lshl_b32 s90, s71, 16 -; SI-NEXT: s_and_b32 s93, s70, 0xffff0000 -; SI-NEXT: s_lshl_b32 s92, s70, 16 +; SI-NEXT: s_and_b32 s77, s81, 0xffff0000 +; SI-NEXT: s_lshl_b32 s76, s81, 16 +; SI-NEXT: s_and_b32 s79, s80, 0xffff0000 +; SI-NEXT: s_lshl_b32 s78, s80, 16 +; SI-NEXT: s_and_b32 s89, s71, 0xffff0000 +; SI-NEXT: s_lshl_b32 s88, s71, 16 +; SI-NEXT: s_and_b32 s91, s70, 0xffff0000 +; SI-NEXT: s_lshl_b32 s90, s70, 16 ; SI-NEXT: s_and_b32 s95, s29, 0xffff0000 ; SI-NEXT: s_lshl_b32 s94, s29, 16 ; SI-NEXT: s_and_b32 s31, s28, 0xffff0000 @@ -23943,8 +23952,8 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI-NEXT: s_lshl_b32 s66, s19, 16 ; SI-NEXT: s_and_b32 s69, s18, 0xffff0000 ; SI-NEXT: s_lshl_b32 s68, s18, 16 -; SI-NEXT: s_and_b32 s76, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s75, s17, 16 +; SI-NEXT: s_and_b32 s93, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s92, s17, 16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v21, s6, 2 ; SI-NEXT: s_lshl_b32 s6, s16, 16 @@ -23953,230 +23962,230 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI-NEXT: v_readlane_b32 s6, v21, 2 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 ; SI-NEXT: v_readlane_b32 s6, v21, 3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s6 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_readlane_b32 s99, v20, 33 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s75 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s68 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s66 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s64 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s54 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s52 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s50 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s50 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s48 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s48 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s38 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s38 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s36 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s36 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s34 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s30 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: 
v_mul_f32_e64 v1, 1.0, s30 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s94 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s92 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s90 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s88 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s88 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s78 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s78 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s74 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s72 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s72 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s62 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: 
v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s60 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s59 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s58 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s58 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s57 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s56 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s56 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s47 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s46 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s46 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s45 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s44 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s44 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s43 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s42 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s42 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s41 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s40 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s40 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s15 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s14 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s14 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s12 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 
16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s12 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s11 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s10 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s10 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s5 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: v_readlane_b32 s30, v20, 34 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: v_readlane_b32 s31, v20, 35 -; SI-NEXT: v_readlane_b32 s99, v20, 33 ; SI-NEXT: v_readlane_b32 s98, v20, 32 ; SI-NEXT: v_readlane_b32 s97, v20, 31 ; SI-NEXT: v_readlane_b32 s96, v20, 30 @@ -27466,562 +27475,737 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v52, v30 -; SI-NEXT: v_mov_b32_e32 v53, v28 -; SI-NEXT: v_mov_b32_e32 v40, v12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v40, 
off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 -; SI-NEXT: v_mov_b32_e32 v55, v14 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_mov_b32_e32 v43, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 +; SI-NEXT: v_mov_b32_e32 v54, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v41, v23 +; SI-NEXT: v_mov_b32_e32 v29, v20 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v59, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v41 +; 
SI-NEXT: v_mul_f32_e32 v17, 1.0, v25
+; SI-NEXT: v_mul_f32_e32 v15, 1.0, v27
+; SI-NEXT: v_mul_f32_e64 v25, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v5, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v7, 1.0, s29
+; SI-NEXT: v_mul_f32_e32 v9, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v54, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v11, 1.0, v6
+; SI-NEXT: v_mul_f32_e32 v56, 1.0, v8
+; SI-NEXT: v_mul_f32_e32 v13, 1.0, v10
+; SI-NEXT: v_mul_f32_e32 v58, 1.0, v12
+; SI-NEXT: v_mul_f32_e32 v60, 1.0, v14
+; SI-NEXT: v_mul_f32_e32 v62, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v47, 1.0, v22
+; SI-NEXT: v_mul_f32_e32 v22, 1.0, v28
+; SI-NEXT: v_mul_f32_e64 v19, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v4, 1.0, s21
-; SI-NEXT: v_mul_f32_e64 v8, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v10, 1.0, s23
-; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27
-; SI-NEXT: v_mul_f32_e64 v11, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29
-; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28
+; SI-NEXT: v_mul_f32_e32 v39, 1.0, v0
 ; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48
-; SI-NEXT: v_mul_f32_e32 v48, 1.0, v26
-; SI-NEXT: v_mul_f32_e32 v22, 1.0, v51
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e32 v23, 1.0, v37
-; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50
-; SI-NEXT: v_mul_f32_e32 v24, 1.0, v38
-; SI-NEXT: v_mul_f32_e32 v50, 1.0, v49
-; SI-NEXT: v_mul_f32_e32 v25, 1.0, v39
-; SI-NEXT: v_mul_f32_e32 v49, 1.0, v30
-; SI-NEXT: v_mul_f32_e32 v26, 1.0, v28
-; SI-NEXT: v_mul_f32_e32 v39, 1.0, v12
-; SI-NEXT: v_mul_f32_e32 v27, 1.0, v31
-; SI-NEXT: s_waitcnt vmcnt(13)
-; SI-NEXT: v_mul_f32_e32 v38, 1.0, v60
-; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_mul_f32_e64 v35, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v33, 1.0, s27
+; SI-NEXT: v_mul_f32_e32 v32, 1.0, v18
+; SI-NEXT: v_mul_f32_e32 v34, 1.0, v29
+; SI-NEXT: v_mul_f32_e32 v36, 1.0, v24
+; SI-NEXT: v_mul_f32_e32 v38, 1.0, v26
+; SI-NEXT: v_mul_f32_e32 v31, 1.0, v30
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v51
+; SI-NEXT: v_mul_f32_e32 v41, 1.0, v53
+; SI-NEXT: v_mul_f32_e32 v26, 1.0, v55
+; SI-NEXT: v_mul_f32_e32 v43, 1.0, v40
 ; SI-NEXT: v_mul_f32_e32 v28, 1.0, v42
-; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62
-; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34
-; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_mul_f32_e32 v33, 1.0, v35
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_mul_f32_e32 v42, 1.0, v36
-; SI-NEXT: v_mul_f32_e64 v12, 1.0, s25
-; SI-NEXT: v_mul_f32_e64 v34, 1.0, s24
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50
+; SI-NEXT: v_mul_f32_e32 v53, 1.0, v52
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46
+; SI-NEXT: v_mul_f32_e64 v48, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v18, 1.0, s24
+; SI-NEXT: v_mul_f32_e64 v29, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v45, 1.0, s28
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
 ; SI-NEXT: s_cbranch_scc0 .LBB19_2
 ; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; SI-NEXT: s_waitcnt expcnt(6)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v0, v0, v3, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_alignbit_b32 v2, v2, v8, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v9, 16
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v58
-; SI-NEXT: s_waitcnt expcnt(5)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v56
-; SI-NEXT: s_waitcnt expcnt(4)
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v44
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_alignbit_b32 v5, v5, v11, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v14, 16
-; SI-NEXT: v_alignbit_b32 v8, v8, v54, 16
-; SI-NEXT: v_alignbit_b32 v9, v9, v46, 16
-; SI-NEXT: v_mov_b32_e32 v62, v61
-; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v61
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v60, v59
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v59
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v56, v47
-; SI-NEXT: v_alignbit_b32 v13, v13, v47, 16
-; SI-NEXT: v_mov_b32_e32 v46, v45
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v45
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v57
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33
-; SI-NEXT: v_alignbit_b32 v4, v4, v34, 16
-; SI-NEXT: v_mov_b32_e32 v63, v44
-; SI-NEXT: v_mov_b32_e32 v58, v57
-; SI-NEXT: v_mov_b32_e32 v44, v43
-; SI-NEXT: v_alignbit_b32 v16, v16, v43, 16
-; SI-NEXT: v_mov_b32_e32 v43, v41
-; SI-NEXT: v_alignbit_b32 v17, v17, v41, 16
-; SI-NEXT: v_alignbit_b32 v18, v18, v40, 16
-; SI-NEXT: v_mov_b32_e32 v40, v55
-; SI-NEXT: v_alignbit_b32 v19, v19, v55, 16
-; SI-NEXT: v_alignbit_b32 v20, v20, v48, 16
-; SI-NEXT: v_mov_b32_e32 v48, v53
-; SI-NEXT: v_alignbit_b32 v21, v21, v53, 16
-; SI-NEXT: v_alignbit_b32 v22, v22, v52, 16
-; SI-NEXT: v_mov_b32_e32 v52, v51
-; SI-NEXT: v_alignbit_b32 v23, v23, v51, 16
-; SI-NEXT: v_alignbit_b32 v24, v24, v50, 16
-; SI-NEXT: v_mov_b32_e32 v50, v49
-; SI-NEXT: v_alignbit_b32 v25, v25, v49, 16
-; SI-NEXT: v_mov_b32_e32 v36, v39
-; SI-NEXT: v_alignbit_b32 v26, v26, v39, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v34, v38
-; SI-NEXT: v_alignbit_b32 v27, v27, v38, 16
-; SI-NEXT: v_mov_b32_e32 v35, v37
-; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16
-; SI-NEXT: v_mov_b32_e32 v37, v32
-; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16
-; SI-NEXT: v_alignbit_b32 v31, v31, v42, 16
+; SI-NEXT: v_mov_b32_e32 v0, v19
+; SI-NEXT: v_mov_b32_e32 v37, v20
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33
+; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7
+; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v63
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v57
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v61
+; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23
+; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44
+; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 s[4:5], 0
-; SI-NEXT: v_mov_b32_e32 v32, v33
-; SI-NEXT: v_mov_b32_e32 v33, v42
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_alignbit_b32 v10, v10, v61, 16
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_alignbit_b32 v12, v12, v54, 16
-; SI-NEXT: v_mov_b32_e32 v41, v61
+; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16
+; SI-NEXT: v_mov_b32_e32 v1, v48
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 16
+; SI-NEXT: v_mov_b32_e32 v2, v14
+; SI-NEXT: v_mov_b32_e32 v49, v15
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16
+; SI-NEXT: v_mov_b32_e32 v3, v16
+; SI-NEXT: v_mov_b32_e32 v20, v17
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v35
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16
+; SI-NEXT: v_mov_b32_e32 v4, v18
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[4:5], v[18:19], 16
+; SI-NEXT: v_mov_b32_e32 v5, v29
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[5:6], v[29:30], 16
+; SI-NEXT: v_mov_b32_e32 v6, v45
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[6:7], v[45:46], 16
+; SI-NEXT: v_mov_b32_e32 v7, v39
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[7:8], v[39:40], 16
+; SI-NEXT: v_mov_b32_e32 v8, v9
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v37
+; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20
+; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v49
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9
+; SI-NEXT: v_mov_b32_e32 v9, v54
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[9:10], v[54:55], 16
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v55, v13
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10
+; SI-NEXT: v_mov_b32_e32 v10, v11
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[10:11], v[11:12], 16
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11
+; SI-NEXT: v_mov_b32_e32 v11, v56
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[11:12], v[56:57], 16
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v56, v44
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12
+; SI-NEXT: v_lshr_b64 v[12:13], v[13:14], 16
+; SI-NEXT: v_mov_b32_e32 v13, v58
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[13:14], v[58:59], 16
+; SI-NEXT: v_mov_b32_e32 v14, v60
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[14:15], v[60:61], 16
+; SI-NEXT: v_mov_b32_e32 v15, v62
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[15:16], v[62:63], 16
+; SI-NEXT: v_mov_b32_e32 v16, v32
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[16:17], v[32:33], 16
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; SI-NEXT: v_mov_b32_e32 v33, v34
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_alignbit_b32 v11, v11, v59, 16
-; SI-NEXT: v_mov_b32_e32 v55, v59
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_alignbit_b32 v14, v14, v45, 16
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v17
+; SI-NEXT: v_mov_b32_e32 v40, v17
+; SI-NEXT: v_lshr_b64 v[17:18], v[34:35], 16
+; SI-NEXT: v_lshr_b64 v[18:19], v[47:48], 16
+; SI-NEXT: v_lshr_b64 v[19:20], v[36:37], 16
+; SI-NEXT: v_mov_b32_e32 v20, v38
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[20:21], v[38:39], 16
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; SI-NEXT: v_mov_b32_e32 v34, v47
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21
+; SI-NEXT: v_mov_b32_e32 v21, v22
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[21:22], v[22:23], 16
+; SI-NEXT: v_mov_b32_e32 v22, v31
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[22:23], v[31:32], 16
+; SI-NEXT: v_mov_b32_e32 v23, v24
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[23:24], v[24:25], 16
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v52
+; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v30
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24
+; SI-NEXT: v_mov_b32_e32 v24, v41
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[24:25], v[41:42], 16
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v41, v26
+; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v50
+; SI-NEXT: v_mov_b32_e32 v42, v51
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25
+; SI-NEXT: v_lshr_b64 v[25:26], v[26:27], 16
+; SI-NEXT: v_mov_b32_e32 v26, v43
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[26:27], v[43:44], 16
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v43, v28
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27
+; SI-NEXT: v_lshr_b64 v[27:28], v[28:29], 16
+; SI-NEXT: v_lshr_b64 v[28:29], v[51:52], 16
+; SI-NEXT: v_lshr_b64 v[29:30], v[53:54], 16
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v52, v53
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_lshr_b64 v[30:31], v[31:32], 16
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_alignbit_b32 v15, v15, v47, 16
-; SI-NEXT: v_mov_b32_e32 v51, v47
-; SI-NEXT: v_mov_b32_e32 v53, v45
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v57
+; SI-NEXT: v_mov_b32_e32 v53, v31
+; SI-NEXT: v_lshr_b64 v[31:32], v[31:32], 16
 ; SI-NEXT: s_branch .LBB19_3
 ; SI-NEXT: .LBB19_2:
-; SI-NEXT: v_mov_b32_e32 v63, v44
-; SI-NEXT: v_mov_b32_e32 v44, v43
-; SI-NEXT: v_mov_b32_e32 v43, v41
-; SI-NEXT: v_mov_b32_e32 v40, v55
-; SI-NEXT: v_mov_b32_e32 v48, v53
-; SI-NEXT: v_mov_b32_e32 v52, v51
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v62, v61
-; SI-NEXT: v_mov_b32_e32 v60, v59
-; SI-NEXT: v_mov_b32_e32 v58, v57
-; SI-NEXT: v_mov_b32_e32 v56, v47
-; SI-NEXT: v_mov_b32_e32 v46, v45
-; SI-NEXT: v_mov_b32_e32 v50, v49
-; SI-NEXT: v_mov_b32_e32 v36, v39
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v34, v38
-; SI-NEXT: v_mov_b32_e32 v35, v37
-; SI-NEXT: v_mov_b32_e32 v37, v32
+; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v56, v44
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v55, v13
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_mov_b32_e32 v33, v34
+; SI-NEXT: v_mov_b32_e32 v34, v47
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v43, v28
+; SI-NEXT: v_mov_b32_e32 v52, v53
+; SI-NEXT: v_mov_b32_e32 v53, v0
 ; SI-NEXT: s_mov_b64 s[4:5], -1
-; SI-NEXT: v_mov_b32_e32 v32, v33
-; SI-NEXT: v_mov_b32_e32 v33, v42
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v41, v26
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v42, v51
 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; SI-NEXT: .LBB19_3: ; %Flow
-; SI-NEXT: v_mov_b32_e32 v38, v50
-; SI-NEXT: v_mov_b32_e32 v39, v52
-; SI-NEXT: v_mov_b32_e32 v49, v40
-; SI-NEXT: v_mov_b32_e32 v50, v43
-; SI-NEXT: v_mov_b32_e32 v43, v44
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v37, v34
+; SI-NEXT: v_mov_b32_e32 v34, v33
+; SI-NEXT: v_mov_b32_e32 v35, v56
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_mov_b32_e32 v32, v40
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
 ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_mov_b32_e32 v33, v38
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v51, v46
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v54, v46
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v44, v46
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v45, v56
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v47, v56
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v58, v60
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
 ; SI-NEXT: s_cbranch_vccnz .LBB19_5
 ; SI-NEXT: ; %bb.4: ; %cmp.true
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57
+; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v49
 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v40
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v39
 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v63
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v62
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v60
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59
 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v58
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v46
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v40
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v35
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
+; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v53
 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: s_waitcnt vmcnt(14)
 ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: s_waitcnt vmcnt(13)
 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
 ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; SI-NEXT: s_waitcnt vmcnt(13)
 ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
 ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58
 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56
 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v47
 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v45
 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v45
+; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46
 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42
+; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44
 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v52
+; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v54
 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v41
+; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v51
 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v55
+; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v50
 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v54
+; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55
 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v56
+; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v48
 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v53
+; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v38
 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v51
+; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v33
 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v43
+; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16
-; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v50
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v34
 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; SI-NEXT: v_lshr_b64 v[17:18], v[17:18], 16
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v37
 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16
-; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49
+; SI-NEXT: v_lshr_b64 v[18:19], v[18:19], 16
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v36
 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshr_b64 v[19:20], v[19:20], 16
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
+; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; SI-NEXT: v_lshr_b64 v[32:33], v[32:33], 16
+; SI-NEXT: s_waitcnt vmcnt(2)
 ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16
-; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v48
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_lshr_b64 v[20:21], v[20:21], 16
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16
-; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v39
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; SI-NEXT: v_lshr_b64 v[21:22], v[21:22], 16
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_lshr_b64 v[22:23], v[22:23], 16
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16
-; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; SI-NEXT: v_lshr_b64 v[23:24], v[23:24], 16
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; SI-NEXT: v_lshr_b64 v[24:25], v[24:25], 16
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v41
 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16
-; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v36
+; SI-NEXT: v_lshr_b64 v[25:26], v[25:26], 16
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
-; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16
-; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
-; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16
-; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v35
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; SI-NEXT: v_lshr_b64 v[26:27], v[26:27], 16
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v43
+; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
+; SI-NEXT: v_lshr_b64 v[27:28], v[27:28], 16
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v42
 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
-; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16
-; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v37
+; SI-NEXT: v_lshr_b64 v[28:29], v[28:29], 16
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v52
 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
-; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshr_b64 v[29:30], v[29:30], 16
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
 ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
-; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16
-; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; SI-NEXT: v_lshr_b64 v[30:31], v[30:31], 16
+; SI-NEXT: v_mov_b32_e32 v31, v32
 ; SI-NEXT: .LBB19_5: ; %end
 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
@@ -28045,40 +28229,43 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; VI-LABEL: bitcast_v64bf16_to_v32i32_scalar:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
 ; VI-NEXT: v_mov_b32_e32 v31, v17
 ; VI-NEXT: v_mov_b32_e32 v30, v16
 ; VI-NEXT: v_mov_b32_e32 v29, v15
-; VI-NEXT: v_mov_b32_e32 v28, v14
 ; VI-NEXT: v_mov_b32_e32 v27, v13
-; VI-NEXT: v_mov_b32_e32 v26, v12
 ; VI-NEXT: v_mov_b32_e32 v25, v11
-; VI-NEXT: v_mov_b32_e32 v24, v10
 ; VI-NEXT: v_mov_b32_e32 v23, v9
-; VI-NEXT: v_mov_b32_e32 v22, v8
 ; VI-NEXT: v_mov_b32_e32 v21, v7
-; VI-NEXT: v_mov_b32_e32 v20, v6
 ; VI-NEXT: v_mov_b32_e32 v19, v5
-; VI-NEXT: v_mov_b32_e32 v32, v4
 ; VI-NEXT: v_mov_b32_e32 v17, v3
-; VI-NEXT: v_mov_b32_e32 v16, v2
 ; VI-NEXT: v_mov_b32_e32 v15, v1
+; VI-NEXT: v_mov_b32_e32 v28, v14
+; VI-NEXT: v_mov_b32_e32 v26, v12
+; VI-NEXT: v_mov_b32_e32 v24, v10
+; VI-NEXT: v_mov_b32_e32 v22, v8
+; VI-NEXT: v_mov_b32_e32 v20, v6
+; VI-NEXT: v_mov_b32_e32 v32, v4
+; VI-NEXT: v_mov_b32_e32 v16, v2
 ; VI-NEXT: v_mov_b32_e32 v14, v0
-; VI-NEXT: v_mov_b32_e32 v0, s16
-; VI-NEXT: v_mov_b32_e32 v1, s17
 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec
-; VI-NEXT: v_mov_b32_e32 v2, s18
+; VI-NEXT: v_mov_b32_e32 v1, s17
 ; VI-NEXT: v_mov_b32_e32 v3, s19
-; VI-NEXT: v_mov_b32_e32 v4, s20
 ; VI-NEXT: v_mov_b32_e32 v5, s21
-; VI-NEXT: v_mov_b32_e32 v6, s22
 ; VI-NEXT: v_mov_b32_e32 v7, s23
-; VI-NEXT: v_mov_b32_e32 v8, s24
 ; VI-NEXT: v_mov_b32_e32 v9, s25
-; VI-NEXT: v_mov_b32_e32 v10, s26
 ; VI-NEXT: v_mov_b32_e32 v11, s27
-; VI-NEXT: v_mov_b32_e32 v12, s28
 ; VI-NEXT: v_mov_b32_e32 v13, s29
+; VI-NEXT: v_mov_b32_e32 v0, s16
+; VI-NEXT: v_mov_b32_e32 v2, s18
+; VI-NEXT: v_mov_b32_e32 v4, s20
+; VI-NEXT: v_mov_b32_e32 v6, s22
+; VI-NEXT: v_mov_b32_e32 v8, s24
+; VI-NEXT: v_mov_b32_e32 v10, s26
+; VI-NEXT: v_mov_b32_e32 v12, s28
 ; VI-NEXT: s_cbranch_scc0 .LBB19_4
 ; VI-NEXT: ; %bb.1: ; %cmp.false
 ; VI-NEXT: s_cbranch_execnz .LBB19_3
@@ -28087,580 +28274,600 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
 ; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
+; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
-; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
-; VI-NEXT: v_bfe_u32 v33, v15, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v15
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
+; VI-NEXT: v_bfe_u32 v18, v15, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15
+; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14
-; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
+; VI-NEXT: v_cndmask_b32_e32 v15, v18, v34, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v15
+; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v14
+; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; VI-NEXT: v_bfe_u32 v18, v15, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15
 ; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
-; VI-NEXT: v_bfe_u32 v33, v14, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v14
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14
+; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[33:34]
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; VI-NEXT: v_bfe_u32 v15, v14, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v35, v18, v33, vcc
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
-; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13
-; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
+; VI-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v14
+; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[35:36]
+; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v13
+; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; VI-NEXT: v_bfe_u32 v18, v15, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15
 ; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
-; VI-NEXT: v_bfe_u32 v33, v13, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v13
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; VI-NEXT: v_bfe_u32 v15, v13, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v35, v18, v33, vcc
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12
-; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
+; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v13
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12
+; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; VI-NEXT: v_bfe_u32 v15, v13, 16, 1
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13
 ; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
-; VI-NEXT: v_bfe_u32 v33, v12, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v12
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12
+; VI-NEXT: v_lshrrev_b64 v[35:36], 16, v[35:36]
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: v_bfe_u32 v13, v12, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v36, v15, v18, vcc
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v12
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
-; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11
-; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
+; VI-NEXT: v_cndmask_b32_e32 v12, v13, v15, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v12
+; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[36:37]
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v11
+; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; VI-NEXT: v_bfe_u32 v15, v13, 16, 1
+; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13
 ; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15
 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
-; VI-NEXT: v_bfe_u32 v33, v11, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v11
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: v_bfe_u32 v13, v11, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v36, v15, v18, vcc
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
-; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; VI-NEXT: v_alignbit_b32 v11, v11, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10
-; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
+; VI-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v11
+; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; VI-NEXT: v_bfe_u32 v13, v11, 16, 1
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11
 ; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
-; VI-NEXT: v_bfe_u32 v33, v10, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v10
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10
+; VI-NEXT: v_lshrrev_b64 v[36:37], 16, v[36:37]
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_bfe_u32 v11, v10, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v37, v13, v15, vcc
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v10
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
-; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v9
-; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
+; VI-NEXT: v_cndmask_b32_e32 v10, v11, v13, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v10
+; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[37:38]
+; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v9
+; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; VI-NEXT: v_bfe_u32 v13, v11, 16, 1
+; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11
 ; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13
 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
-; VI-NEXT: v_bfe_u32 v33, v9, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v9
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
-; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9
+; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
+; VI-NEXT: v_cndmask_b32_e32 v37, v13, v15, vcc
+; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
-; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8
-; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; VI-NEXT: v_bfe_u32 v33, v18, 16, 1
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18
-; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33
+; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8
+; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; VI-NEXT: v_bfe_u32 v11, v9, 16, 1
+;
VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 ; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v8 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; VI-NEXT: v_lshrrev_b64 v[37:38], 16, v[37:38] +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v38, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[38:39] +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 ; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v7 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v38, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v6 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, 
v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[48:49] +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v5 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 ; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v4 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[48:49] +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v49, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[49:50] +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; VI-NEXT: v_add_f32_e32 v5, 
0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 ; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v3 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v49, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v2 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[50:51] +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v1 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v5, v7, vcc 
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; VI-NEXT: v_lshrrev_b64 v[50:51], 16, v[50:51] +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v51, v3, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v31, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v31 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v30, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v30 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_alignbit_b32 v30, v30, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, 
v33 -; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v29 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v27 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v26, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v26 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v25 -; VI-NEXT: 
v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v24, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v24 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_alignbit_b32 v24, v24, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v23 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_alignbit_b32 v23, v23, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v22, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v22 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v21 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 -; VI-NEXT: v_add_f32_e32 v18, 
0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v20, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v20 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v19 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v17 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: 
v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[51:52] +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v51, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; VI-NEXT: v_lshrrev_b64 v[51:52], 16, v[51:52] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v52, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[30:31], 16, v[52:53] +; VI-NEXT: v_cndmask_b32_e32 v52, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 +; VI-NEXT: v_lshrrev_b64 v[52:53], 16, v[52:53] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v53, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: 
v_lshrrev_b32_e32 v54, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[53:54] +; VI-NEXT: v_cndmask_b32_e32 v53, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 +; VI-NEXT: v_lshrrev_b64 v[53:54], 16, v[53:54] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v54, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[26:27], 16, v[54:55] +; VI-NEXT: v_cndmask_b32_e32 v54, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 +; VI-NEXT: v_lshrrev_b64 v[38:39], 16, v[38:39] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v39, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[24:25], 16, 
v[39:40] +; VI-NEXT: v_cndmask_b32_e32 v39, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; VI-NEXT: v_lshrrev_b64 v[39:40], 16, v[39:40] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v40, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[40:41] +; VI-NEXT: v_cndmask_b32_e32 v40, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 +; VI-NEXT: v_lshrrev_b64 v[40:41], 16, v[40:41] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v41, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v18, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 
v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v31, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[41:42] +; VI-NEXT: v_cndmask_b32_e32 v41, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v15, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b64 v[54:55], 16, v[54:55] +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_lshrrev_b64 v[41:42], 16, v[41:42] +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[31:32] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[15:16] +; VI-NEXT: v_mov_b32_e32 v1, v50 +; VI-NEXT: v_mov_b32_e32 v3, v49 +; VI-NEXT: v_mov_b32_e32 v5, v48 +; VI-NEXT: v_mov_b32_e32 v7, v38 +; VI-NEXT: v_mov_b32_e32 v9, v37 +; VI-NEXT: v_mov_b32_e32 v11, v36 +; VI-NEXT: v_mov_b32_e32 v13, v35 +; VI-NEXT: v_mov_b32_e32 v15, v34 +; VI-NEXT: v_mov_b32_e32 v17, v41 +; VI-NEXT: v_mov_b32_e32 v19, v18 +; VI-NEXT: v_mov_b32_e32 v21, v40 +; VI-NEXT: v_mov_b32_e32 v23, v39 +; VI-NEXT: v_mov_b32_e32 v25, v54 +; VI-NEXT: v_mov_b32_e32 v27, v53 +; VI-NEXT: v_mov_b32_e32 v29, v52 +; VI-NEXT: v_mov_b32_e32 v31, v51 ; VI-NEXT: .LBB19_3: ; %end +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB19_4: ; VI-NEXT: s_branch .LBB19_2 @@ -62496,214 +62703,213 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; 
SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v61 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v61 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v60 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v59 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v58 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v57 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v56 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v47 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v46 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v45 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v44 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v43 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v42 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v41 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v40 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v55 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v53 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; 
SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v52 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v51 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v49 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v39 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v37 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v35 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v33 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v29 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 -; 
SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v27 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v27 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v25 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v23 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v21 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v19 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v17 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v13 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v11 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 
0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v9 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v5 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_readlane_b32 s30, v63, 34 ; SI-NEXT: v_readlane_b32 s31, v63, 35 ; SI-NEXT: v_readlane_b32 s99, v63, 33 @@ -62740,22 +62946,23 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg ; SI-NEXT: v_readlane_b32 s36, v63, 2 ; SI-NEXT: v_readlane_b32 s35, v63, 1 ; SI-NEXT: v_readlane_b32 s34, v63, 0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -66011,562 +66218,737 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v52, v30 -; SI-NEXT: v_mov_b32_e32 
v53, v28 -; SI-NEXT: v_mov_b32_e32 v40, v12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 -; SI-NEXT: v_mov_b32_e32 v55, v14 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_mov_b32_e32 v43, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 +; SI-NEXT: v_mov_b32_e32 v54, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v41, v23 +; SI-NEXT: v_mov_b32_e32 v29, v20 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v59, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v27 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v28 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 +; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v0 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v38 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v49 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v39 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v31 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v60 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s27 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v40 ; SI-NEXT: v_mul_f32_e32 v28, 1.0, v42 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v35 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v36 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s24 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 
offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v52 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v45, 1.0, s28 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, 
off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB43_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v3, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_alignbit_b32 v2, v2, v8, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v9, 16 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v58 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v56 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_alignbit_b32 v5, v5, v11, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v14, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v54, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v46, 16 -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v61 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_alignbit_b32 v13, v13, v47, 16 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v45 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v57 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 
v21, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_alignbit_b32 v4, v4, v34, 16 -; SI-NEXT: v_mov_b32_e32 v63, v44 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_alignbit_b32 v16, v16, v43, 16 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_alignbit_b32 v17, v17, v41, 16 -; SI-NEXT: v_alignbit_b32 v18, v18, v40, 16 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_alignbit_b32 v19, v19, v55, 16 -; SI-NEXT: v_alignbit_b32 v20, v20, v48, 16 -; SI-NEXT: v_mov_b32_e32 v48, v53 -; SI-NEXT: v_alignbit_b32 v21, v21, v53, 16 -; SI-NEXT: v_alignbit_b32 v22, v22, v52, 16 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_alignbit_b32 v23, v23, v51, 16 -; SI-NEXT: v_alignbit_b32 v24, v24, v50, 16 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_alignbit_b32 v25, v25, v49, 16 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: v_alignbit_b32 v26, v26, v39, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v34, v38 -; SI-NEXT: v_alignbit_b32 v27, v27, v38, 16 -; SI-NEXT: v_mov_b32_e32 v35, v37 -; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16 -; SI-NEXT: v_mov_b32_e32 v37, v32 -; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; SI-NEXT: v_alignbit_b32 v31, v31, v42, 16 +; SI-NEXT: v_mov_b32_e32 v0, v19 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v57 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v42 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_alignbit_b32 v10, v10, v61, 16 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_alignbit_b32 v12, v12, v54, 16 -; SI-NEXT: v_mov_b32_e32 v41, v61 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 +; SI-NEXT: v_mov_b32_e32 v1, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 16 +; SI-NEXT: v_mov_b32_e32 v2, v14 +; SI-NEXT: v_mov_b32_e32 v49, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 
offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16 +; SI-NEXT: v_mov_b32_e32 v3, v16 +; SI-NEXT: v_mov_b32_e32 v20, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v35 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16 +; SI-NEXT: v_mov_b32_e32 v4, v18 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[4:5], v[18:19], 16 +; SI-NEXT: v_mov_b32_e32 v5, v29 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[5:6], v[29:30], 16 +; SI-NEXT: v_mov_b32_e32 v6, v45 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[6:7], v[45:46], 16 +; SI-NEXT: v_mov_b32_e32 v7, v39 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[7:8], v[39:40], 16 +; SI-NEXT: v_mov_b32_e32 v8, v9 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v49 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: v_mov_b32_e32 v9, v54 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[9:10], v[54:55], 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v55, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; SI-NEXT: v_mov_b32_e32 v10, v11 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[10:11], v[11:12], 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 +; SI-NEXT: v_mov_b32_e32 v11, v56 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[11:12], v[56:57], 16 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v56, v44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshr_b64 v[12:13], v[13:14], 16 +; SI-NEXT: v_mov_b32_e32 v13, v58 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[13:14], v[58:59], 16 +; SI-NEXT: v_mov_b32_e32 v14, v60 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[14:15], v[60:61], 16 +; SI-NEXT: v_mov_b32_e32 v15, v62 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[15:16], v[62:63], 16 +; SI-NEXT: v_mov_b32_e32 v16, v32 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[16:17], v[32:33], 16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_alignbit_b32 v11, v11, v59, 16 -; SI-NEXT: v_mov_b32_e32 v55, v59 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_alignbit_b32 v14, v14, v45, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v17 +; SI-NEXT: v_mov_b32_e32 v40, v17 +; SI-NEXT: v_lshr_b64 v[17:18], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[18:19], v[47:48], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[36:37], 16 +; SI-NEXT: v_mov_b32_e32 v20, v38 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[20:21], v[38:39], 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_mov_b32_e32 v34, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_mov_b32_e32 v21, v22 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[21:22], v[22:23], 16 +; SI-NEXT: v_mov_b32_e32 v22, v31 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[22:23], v[31:32], 16 +; SI-NEXT: v_mov_b32_e32 v23, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[23:24], v[24:25], 16 +; SI-NEXT: buffer_load_dword v24, 
off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24 +; SI-NEXT: v_mov_b32_e32 v24, v41 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[24:25], v[41:42], 16 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v41, v26 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v50 +; SI-NEXT: v_mov_b32_e32 v42, v51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_lshr_b64 v[25:26], v[26:27], 16 +; SI-NEXT: v_mov_b32_e32 v26, v43 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[26:27], v[43:44], 16 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v43, v28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v15, v15, v47, 16 -; SI-NEXT: v_mov_b32_e32 v51, v47 -; SI-NEXT: v_mov_b32_e32 v53, v45 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; SI-NEXT: v_lshr_b64 v[27:28], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[51:52], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[53:54], 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshr_b64 v[30:31], v[31:32], 16 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v57 +; SI-NEXT: v_mov_b32_e32 v53, v31 +; SI-NEXT: v_lshr_b64 v[31:32], v[31:32], 16 ; SI-NEXT: s_branch .LBB43_3 ; SI-NEXT: .LBB43_2: -; SI-NEXT: v_mov_b32_e32 v63, v44 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v48, v53 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v34, v38 -; SI-NEXT: v_mov_b32_e32 
v35, v37 -; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v56, v44 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v55, v13 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v47 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v43, v28 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v0 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v42 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v41, v26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v51 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB43_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v38, v50 -; SI-NEXT: v_mov_b32_e32 v39, v52 -; SI-NEXT: v_mov_b32_e32 v49, v40 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v43, v44 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v35, v56 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mov_b32_e32 v32, v40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v33, v38 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v54, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:268 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v44, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v45, v56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v47, v56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v58, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: s_cbranch_vccnz .LBB43_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v40 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v63 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v62 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v58 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, 
s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, 
v22 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v47 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: 
v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v45 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v52 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v41 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v55 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v54 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v56 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v53 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v51 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v43 +; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v50 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, 
v17 +; SI-NEXT: v_lshr_b64 v[17:18], v[17:18], 16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 +; SI-NEXT: v_lshr_b64 v[18:19], v[18:19], 16 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshr_b64 v[19:20], v[19:20], 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_lshr_b64 v[32:33], v[32:33], 16 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshr_b64 v[20:21], v[20:21], 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v39 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshr_b64 v[21:22], v[21:22], 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshr_b64 v[22:23], v[22:23], 16 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: 
v_alignbit_b32 v24, v25, v24, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshr_b64 v[23:24], v[23:24], 16 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v36 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshr_b64 v[24:25], v[24:25], 16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshr_b64 v[25:26], v[25:26], 16 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v35 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshr_b64 v[26:27], v[26:27], 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshr_b64 v[27:28], v[27:28], 16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v37 +; SI-NEXT: v_lshr_b64 v[28:29], v[28:29], 16 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshr_b64 v[29:30], v[29:30], 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshr_b64 v[30:31], v[30:31], 16 +; SI-NEXT: v_mov_b32_e32 v31, v32 ; SI-NEXT: .LBB43_5: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -66590,40 +66972,43 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-LABEL: 
bitcast_v64bf16_to_v32f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; VI-NEXT: v_mov_b32_e32 v31, v17 ; VI-NEXT: v_mov_b32_e32 v30, v16 ; VI-NEXT: v_mov_b32_e32 v29, v15 -; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 -; VI-NEXT: v_mov_b32_e32 v26, v12 ; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v23, v9 -; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v21, v7 -; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v19, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 ; VI-NEXT: v_mov_b32_e32 v17, v3 -; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v14, v0 -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 ; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: s_cbranch_scc0 .LBB43_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB43_3 @@ -66632,580 +67017,600 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v15 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v15, v18, v34, vcc +; VI-NEXT: 
v_lshrrev_b32_e32 v34, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 ; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v14 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[33:34] +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v35, v18, v33, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[35:36] +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v13 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 ; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v13 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v35, v18, v33, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 ; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v12, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v12 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, 
v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; VI-NEXT: v_lshrrev_b64 v[35:36], 16, v[35:36] +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[36:37] +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 ; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v11 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v11, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 ; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v10, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v10 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; VI-NEXT: v_lshrrev_b64 v[36:37], 16, v[36:37] +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v37, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16 -; VI-NEXT: 
v_lshlrev_b32_e32 v18, 16, v9 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[37:38] +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 ; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v9 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v37, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 ; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v8 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; VI-NEXT: v_lshrrev_b64 v[37:38], 16, v[37:38] +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v38, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[38:39] +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 ; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: 
v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v7 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v38, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v6 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[48:49] +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v5 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: 
v_alignbit_b32 v5, v5, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 ; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v4 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[48:49] +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v49, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[49:50] +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 ; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v3 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v49, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: 
v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v2 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[50:51] +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v1 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; VI-NEXT: v_lshrrev_b64 v[50:51], 16, v[50:51] +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v51, v3, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; VI-NEXT: 
v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v31, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v31 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v30, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v30 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_alignbit_b32 v30, v30, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v29 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, 
v27 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v27 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v26, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v26 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v25 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v24, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v24 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_alignbit_b32 v24, v24, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v23 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: 
v_or_b32_e32 v34, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_alignbit_b32 v23, v23, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v22, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v22 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v21 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v20, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v20 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v19 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v17 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[51:52] +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v51, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; VI-NEXT: v_lshrrev_b64 v[51:52], 16, v[51:52] +; VI-NEXT: 
v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v52, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[30:31], 16, v[52:53] +; VI-NEXT: v_cndmask_b32_e32 v52, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 +; VI-NEXT: v_lshrrev_b64 v[52:53], 16, v[52:53] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v53, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[53:54] +; VI-NEXT: v_cndmask_b32_e32 v53, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 +; VI-NEXT: v_lshrrev_b64 v[53:54], 16, v[53:54] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v54, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, 
v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[26:27], 16, v[54:55] +; VI-NEXT: v_cndmask_b32_e32 v54, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 +; VI-NEXT: v_lshrrev_b64 v[38:39], 16, v[38:39] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v39, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[24:25], 16, v[39:40] +; VI-NEXT: v_cndmask_b32_e32 v39, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; VI-NEXT: v_lshrrev_b64 v[39:40], 16, v[39:40] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v40, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[40:41] +; VI-NEXT: v_cndmask_b32_e32 v40, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 
v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 +; VI-NEXT: v_lshrrev_b64 v[40:41], 16, v[40:41] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v41, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v18, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v31, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[41:42] +; VI-NEXT: v_cndmask_b32_e32 v41, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 
0xffff0000, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v15, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b64 v[54:55], 16, v[54:55] +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_lshrrev_b64 v[41:42], 16, v[41:42] +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[31:32] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[15:16] +; VI-NEXT: v_mov_b32_e32 v1, v50 +; VI-NEXT: v_mov_b32_e32 v3, v49 +; VI-NEXT: v_mov_b32_e32 v5, v48 +; VI-NEXT: v_mov_b32_e32 v7, v38 +; VI-NEXT: v_mov_b32_e32 v9, v37 +; VI-NEXT: v_mov_b32_e32 v11, v36 +; VI-NEXT: v_mov_b32_e32 v13, v35 +; VI-NEXT: v_mov_b32_e32 v15, v34 +; VI-NEXT: v_mov_b32_e32 v17, v41 +; VI-NEXT: v_mov_b32_e32 v19, v18 +; VI-NEXT: v_mov_b32_e32 v21, v40 +; VI-NEXT: v_mov_b32_e32 v23, v39 +; VI-NEXT: v_mov_b32_e32 v25, v54 +; VI-NEXT: v_mov_b32_e32 v27, v53 +; VI-NEXT: v_mov_b32_e32 v29, v52 +; VI-NEXT: v_mov_b32_e32 v31, v51 ; VI-NEXT: .LBB43_3: ; %end +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB43_4: ; VI-NEXT: s_branch .LBB43_2 @@ -98683,229 +99088,229 @@ define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a ; SI-NEXT: v_writelane_b32 v21, s8, 3 ; SI-NEXT: .LBB61_3: ; %end ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s68 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_readlane_b32 s4, v21, 2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s66 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s64 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s54 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s52 -; SI-NEXT: 
v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s50 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s50 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s48 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s48 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s38 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s38 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s36 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s36 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s34 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s30 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s94 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s92 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
v_mul_f32_e64 v1, 1.0, s91
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s90
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s88
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s88
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s78
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s78
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s74
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s72
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s72
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s62
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s60
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s59
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s58
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s58
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s57
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s56
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s56
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s47
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s46
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s46
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s45
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s44
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s44
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s43
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s42
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s42
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s41
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s40
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s40
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s15
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s14
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s14
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s12
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s12
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s11
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s10
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s10
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
-; SI-NEXT: v_readlane_b32 s4, v21, 2
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
 ; SI-NEXT: v_readlane_b32 s4, v21, 3
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
 ; SI-NEXT: v_readlane_b32 s4, v21, 0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
 ; SI-NEXT: v_readlane_b32 s4, v21, 1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
 ; SI-NEXT: v_readlane_b32 s30, v20, 34
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
@@ -102278,562 +102683,737 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v52, v30
-; SI-NEXT: v_mov_b32_e32 v53, v28
-; SI-NEXT: v_mov_b32_e32 v40, v12
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
-; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:52
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68
-; SI-NEXT: v_mov_b32_e32 v55, v14
-; SI-NEXT: v_mul_f32_e32 v14, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v63, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7
+; SI-NEXT: v_mov_b32_e32 v43, v21
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11
+; SI-NEXT: v_mov_b32_e32 v54, v29
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16
-; SI-NEXT: v_mul_f32_e32 v58, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3
-; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v44, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v46, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v61, 1.0, v7
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v41, v23
+; SI-NEXT: v_mov_b32_e32 v29, v20
+; SI-NEXT: v_mul_f32_e32 v57, 1.0, v3
 ; SI-NEXT: v_mul_f32_e32 v59, 1.0, v9
-; SI-NEXT: v_mul_f32_e32 v57, 1.0, v11
-; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; SI-NEXT: v_mul_f32_e32 v47, 1.0, v40
-; SI-NEXT: v_mul_f32_e32 v45, 1.0, v15
-; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19
-; SI-NEXT: v_mul_f32_e32 v43, 1.0, v18
-; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21
-; SI-NEXT: v_mul_f32_e32 v41, 1.0, v20
-; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23
-; SI-NEXT: v_mul_f32_e32 v40, 1.0, v22
-; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25
-; SI-NEXT: v_mul_f32_e32 v55, 1.0, v24
-; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27
-; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29
-; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53
-; SI-NEXT: v_mul_f32_e32 v52, 1.0, v52
+; SI-NEXT: v_mul_f32_e32 v61, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v23, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v44, 1.0, v17
+; SI-NEXT: v_mul_f32_e32 v21, 1.0, v19
+; SI-NEXT: v_mul_f32_e32 v20, 1.0, v41
+; SI-NEXT: v_mul_f32_e32 v17, 1.0, v25
+; SI-NEXT: v_mul_f32_e32 v15, 1.0, v27
+; SI-NEXT: v_mul_f32_e64 v25, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v5, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v7, 1.0, s29
+; SI-NEXT: v_mul_f32_e32 v9, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v54, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v11, 1.0, v6
+; SI-NEXT: v_mul_f32_e32 v56, 1.0, v8
+; SI-NEXT: v_mul_f32_e32 v13, 1.0, v10
+; SI-NEXT: v_mul_f32_e32 v58, 1.0, v12
+; SI-NEXT: v_mul_f32_e32 v60, 1.0, v14
+; SI-NEXT: v_mul_f32_e32 v62, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v47, 1.0, v22
+; SI-NEXT: v_mul_f32_e32 v22, 1.0, v28
+; SI-NEXT: v_mul_f32_e64 v19, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16
 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v4, 1.0, s21
-; SI-NEXT: v_mul_f32_e64 v8, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v10, 1.0, s23
-; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27
-; SI-NEXT: v_mul_f32_e64 v11, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29
-; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28
+; SI-NEXT: v_mul_f32_e32 v39, 1.0, v0
 ; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48
-; SI-NEXT: v_mul_f32_e32 v48, 1.0, v26
-; SI-NEXT: v_mul_f32_e32 v22, 1.0, v51
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e32 v23, 1.0, v37
-; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50
-; SI-NEXT: v_mul_f32_e32 v24, 1.0, v38
-; SI-NEXT: v_mul_f32_e32 v50, 1.0, v49
-; SI-NEXT: v_mul_f32_e32 v25, 1.0, v39
-; SI-NEXT: v_mul_f32_e32 v49, 1.0, v30
-; SI-NEXT: v_mul_f32_e32 v26, 1.0, v28
-; SI-NEXT: v_mul_f32_e32 v39, 1.0, v12
-; SI-NEXT: v_mul_f32_e32 v27, 1.0, v31
-; SI-NEXT: s_waitcnt vmcnt(13)
-; SI-NEXT: v_mul_f32_e32 v38, 1.0, v60
-; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_mul_f32_e64 v35, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v33, 1.0, s27
+; SI-NEXT: v_mul_f32_e32 v32, 1.0, v18
+; SI-NEXT: v_mul_f32_e32 v34, 1.0, v29
+; SI-NEXT: v_mul_f32_e32 v36, 1.0, v24
+; SI-NEXT: v_mul_f32_e32 v38, 1.0, v26
+; SI-NEXT: v_mul_f32_e32 v31, 1.0, v30
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v51
+; SI-NEXT: v_mul_f32_e32 v41, 1.0, v53
+; SI-NEXT: v_mul_f32_e32 v26, 1.0, v55
+; SI-NEXT: v_mul_f32_e32 v43, 1.0, v40
 ; SI-NEXT: v_mul_f32_e32 v28, 1.0, v42
-; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62
-; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34
-; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_mul_f32_e32 v33, 1.0, v35
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_mul_f32_e32 v42, 1.0, v36
-; SI-NEXT: v_mul_f32_e64 v12, 1.0, s25
-; SI-NEXT: v_mul_f32_e64 v34, 1.0, s24
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50
+; SI-NEXT: v_mul_f32_e32 v53, 1.0, v52
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46
+; SI-NEXT: v_mul_f32_e64 v48, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v18, 1.0, s24
+; SI-NEXT: v_mul_f32_e64 v29, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v45, 1.0, s28
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
 ; SI-NEXT: s_cbranch_scc0 .LBB63_2
 ; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; SI-NEXT: s_waitcnt expcnt(6)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_alignbit_b32 v0, v0, v3, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_alignbit_b32 v2, v2, v8, 16
-; SI-NEXT: v_alignbit_b32 v3, v3, v9, 16
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v58
-; SI-NEXT: s_waitcnt expcnt(5)
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v56
-; SI-NEXT: s_waitcnt expcnt(4)
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v44
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_alignbit_b32 v5, v5, v11, 16
-; SI-NEXT: v_alignbit_b32 v7, v7, v14, 16
-; SI-NEXT: v_alignbit_b32 v8, v8, v54, 16
-; SI-NEXT: v_alignbit_b32 v9, v9, v46, 16
-; SI-NEXT: v_mov_b32_e32 v62, v61
-; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v61
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v60, v59
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v59
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v56, v47
-; SI-NEXT: v_alignbit_b32 v13, v13, v47, 16
-; SI-NEXT: v_mov_b32_e32 v46, v45
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v45
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v57
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33
-; SI-NEXT: v_alignbit_b32 v4, v4, v34, 16
-; SI-NEXT: v_mov_b32_e32 v63, v44
-; SI-NEXT: v_mov_b32_e32 v58, v57
-; SI-NEXT: v_mov_b32_e32 v44, v43
-; SI-NEXT: v_alignbit_b32 v16, v16, v43, 16
-; SI-NEXT: v_mov_b32_e32 v43, v41
-; SI-NEXT: v_alignbit_b32 v17, v17, v41, 16
-; SI-NEXT: v_alignbit_b32 v18, v18, v40, 16
-; SI-NEXT: v_mov_b32_e32 v40, v55
-; SI-NEXT: v_alignbit_b32 v19, v19, v55, 16
-; SI-NEXT: v_alignbit_b32 v20, v20, v48, 16
-; SI-NEXT: v_mov_b32_e32 v48, v53
-; SI-NEXT: v_alignbit_b32 v21, v21, v53, 16
-; SI-NEXT: v_alignbit_b32 v22, v22, v52, 16
-; SI-NEXT: v_mov_b32_e32 v52, v51
-; SI-NEXT: v_alignbit_b32 v23, v23, v51, 16
-; SI-NEXT: v_alignbit_b32 v24, v24, v50, 16
-; SI-NEXT: v_mov_b32_e32 v50, v49
-; SI-NEXT: v_alignbit_b32 v25, v25, v49, 16
-; SI-NEXT: v_mov_b32_e32 v36, v39
-; SI-NEXT: v_alignbit_b32 v26, v26, v39, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v34, v38
-; SI-NEXT: v_alignbit_b32 v27, v27, v38, 16
-; SI-NEXT: v_mov_b32_e32 v35, v37
-; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16
-; SI-NEXT: v_mov_b32_e32 v37, v32
-; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16
-; SI-NEXT: v_alignbit_b32 v31, v31, v42, 16
+; SI-NEXT: v_mov_b32_e32 v0, v19
+; SI-NEXT: v_mov_b32_e32 v37, v20
+; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33
+; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7
+; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v63
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v57
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v61
+; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23
+; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44
+; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 s[4:5], 0
-; SI-NEXT: v_mov_b32_e32 v32, v33
-; SI-NEXT: v_mov_b32_e32 v33, v42
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_alignbit_b32 v10, v10, v61, 16
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_alignbit_b32 v12, v12, v54, 16
-; SI-NEXT: v_mov_b32_e32 v41, v61
+; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16
+; SI-NEXT: v_mov_b32_e32 v1, v48
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 16
+; SI-NEXT: v_mov_b32_e32 v2, v14
+; SI-NEXT: v_mov_b32_e32 v49, v15
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16
+; SI-NEXT: v_mov_b32_e32 v3, v16
+; SI-NEXT: v_mov_b32_e32 v20, v17
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v35
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16
+; SI-NEXT: v_mov_b32_e32 v4, v18
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[4:5], v[18:19], 16
+; SI-NEXT: v_mov_b32_e32 v5, v29
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[5:6], v[29:30], 16
+; SI-NEXT: v_mov_b32_e32 v6, v45
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[6:7], v[45:46], 16
+; SI-NEXT: v_mov_b32_e32 v7, v39
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[7:8], v[39:40], 16
+; SI-NEXT: v_mov_b32_e32 v8, v9
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v37
+; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20
+; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v49
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9
+; SI-NEXT: v_mov_b32_e32 v9, v54
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[9:10], v[54:55], 16
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v55, v13
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10
+; SI-NEXT: v_mov_b32_e32 v10, v11
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[10:11], v[11:12], 16
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11
+; SI-NEXT: v_mov_b32_e32 v11, v56
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[11:12], v[56:57], 16
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v56, v44
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12
+; SI-NEXT: v_lshr_b64 v[12:13], v[13:14], 16
+; SI-NEXT: v_mov_b32_e32 v13, v58
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[13:14], v[58:59], 16
+; SI-NEXT: v_mov_b32_e32 v14, v60
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[14:15], v[60:61], 16
+; SI-NEXT: v_mov_b32_e32 v15, v62
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[15:16], v[62:63], 16
+; SI-NEXT: v_mov_b32_e32 v16, v32
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[16:17], v[32:33], 16
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; SI-NEXT: v_mov_b32_e32 v33, v34
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_alignbit_b32 v11, v11, v59, 16
-; SI-NEXT: v_mov_b32_e32 v55, v59
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_alignbit_b32 v14, v14, v45, 16
+; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v17
+; SI-NEXT: v_mov_b32_e32 v40, v17
+; SI-NEXT: v_lshr_b64 v[17:18], v[34:35], 16
+; SI-NEXT: v_lshr_b64 v[18:19], v[47:48], 16
+; SI-NEXT: v_lshr_b64 v[19:20], v[36:37], 16
+; SI-NEXT: v_mov_b32_e32 v20, v38
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[20:21], v[38:39], 16
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; SI-NEXT: v_mov_b32_e32 v34, v47
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21
+; SI-NEXT: v_mov_b32_e32 v21, v22
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[21:22], v[22:23], 16
+; SI-NEXT: v_mov_b32_e32 v22, v31
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[22:23], v[31:32], 16
+; SI-NEXT: v_mov_b32_e32 v23, v24
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[23:24], v[24:25], 16
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v52
+; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v30
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24
+; SI-NEXT: v_mov_b32_e32 v24, v41
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[24:25], v[41:42], 16
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v41, v26
+; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v50
+; SI-NEXT: v_mov_b32_e32 v42, v51
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25
+; SI-NEXT: v_lshr_b64 v[25:26], v[26:27], 16
+; SI-NEXT: v_mov_b32_e32 v26, v43
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[26:27], v[43:44], 16
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v43, v28
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27
+; SI-NEXT: v_lshr_b64 v[27:28], v[28:29], 16
+; SI-NEXT: v_lshr_b64 v[28:29], v[51:52], 16
+; SI-NEXT: v_lshr_b64 v[29:30], v[53:54], 16
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v52, v53
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_lshr_b64 v[30:31], v[31:32], 16
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_alignbit_b32 v15, v15, v47, 16
-; SI-NEXT: v_mov_b32_e32 v51, v47
-; SI-NEXT: v_mov_b32_e32 v53, v45
+; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v57
+; SI-NEXT: v_mov_b32_e32 v53, v31
+; SI-NEXT: v_lshr_b64 v[31:32], v[31:32], 16
 ; SI-NEXT: s_branch .LBB63_3
 ; SI-NEXT: .LBB63_2:
-; SI-NEXT: v_mov_b32_e32 v63, v44
-; SI-NEXT: v_mov_b32_e32 v44, v43
-; SI-NEXT: v_mov_b32_e32 v43, v41
-; SI-NEXT: v_mov_b32_e32 v40, v55
-; SI-NEXT: v_mov_b32_e32 v48, v53
-; SI-NEXT: v_mov_b32_e32 v52, v51
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v62, v61
-; SI-NEXT: v_mov_b32_e32 v60, v59
-; SI-NEXT: v_mov_b32_e32 v58, v57
-; SI-NEXT: v_mov_b32_e32 v56, v47
-; SI-NEXT: v_mov_b32_e32 v46, v45
-; SI-NEXT: v_mov_b32_e32 v50, v49
-; SI-NEXT: v_mov_b32_e32 v36, v39
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v34, v38
-; SI-NEXT: v_mov_b32_e32 v35, v37
-; SI-NEXT: v_mov_b32_e32 v37, v32
+; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v56, v44
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v55, v13
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_mov_b32_e32 v33, v34
+; SI-NEXT: v_mov_b32_e32 v34, v47
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v43, v28
+; SI-NEXT: v_mov_b32_e32 v52, v53
+; SI-NEXT: v_mov_b32_e32 v53, v0
 ; SI-NEXT: s_mov_b64 s[4:5], -1
-; SI-NEXT: v_mov_b32_e32 v32, v33
-; SI-NEXT: v_mov_b32_e32 v33, v42
+; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v41, v26
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v42, v51
 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; SI-NEXT: .LBB63_3: ; %Flow
-; SI-NEXT: v_mov_b32_e32 v38, v50
-; SI-NEXT: v_mov_b32_e32 v39, v52
-; SI-NEXT: v_mov_b32_e32 v49, v40
-; SI-NEXT: v_mov_b32_e32 v50, v43
-; SI-NEXT: v_mov_b32_e32 v43, v44
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v37, v34
+; SI-NEXT: v_mov_b32_e32 v34, v33
+; SI-NEXT: v_mov_b32_e32 v35, v56
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_mov_b32_e32 v32, v40
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
 ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_mov_b32_e32 v33, v38
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v51, v46
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v54, v46
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v44, v46
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v45, v56
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v47, v56
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_mov_b32_e32 v58, v60
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
 ; SI-NEXT: s_cbranch_vccnz .LBB63_5
 ; SI-NEXT: ; %bb.4: ; %cmp.true
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57
+; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v49
 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v40
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v39
 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v63
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v62
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v60
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59
 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v58
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v46
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v40
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v35
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
+; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v53
 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
-; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: s_waitcnt vmcnt(14)
 ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: s_waitcnt vmcnt(13)
 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
 ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; SI-NEXT: s_waitcnt vmcnt(13)
 ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
-; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
-; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
 ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
+; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
-; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58
 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56
 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v47
 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16
+; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v45
 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v45
+; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46
 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42
+; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44
 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v52
+; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v54
 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v41
+; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v51
 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v55
+; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v50
 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v54
+; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16
+; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55
 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v56
+; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v48
 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v53
+; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16
+; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v38
 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v51
+; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v33
 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16
-; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v43
+; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16
-; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v50
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v34
 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; SI-NEXT: v_lshr_b64 v[17:18], v[17:18], 16
+; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v37
 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16
-; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49
+; SI-NEXT: v_lshr_b64 v[18:19], v[18:19], 16
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v36
 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshr_b64 v[19:20], v[19:20], 16
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
+; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; SI-NEXT: v_lshr_b64 v[32:33], v[32:33], 16
+; SI-NEXT: s_waitcnt vmcnt(2)
 ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16
-; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v48
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_lshr_b64 v[20:21], v[20:21], 16
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16
-; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16
-; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v39
+; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; SI-NEXT: v_lshr_b64 v[21:22], v[21:22], 16
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_lshr_b64 v[22:23], v[22:23], 16
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16
-; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; SI-NEXT: v_lshr_b64 v[23:24], v[23:24], 16
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; SI-NEXT: v_lshr_b64 v[24:25], v[24:25], 16
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v41
 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16
-; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v36
+; SI-NEXT: v_lshr_b64 v[25:26], v[25:26], 16
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
-; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16
-; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
-; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16
-; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v35
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; SI-NEXT: v_lshr_b64 v[26:27], v[26:27], 16
+; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v43
+; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
+; SI-NEXT: v_lshr_b64 v[27:28], v[27:28], 16
+; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v42
 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
-; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16
-; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v37
+; SI-NEXT: v_lshr_b64 v[28:29], v[28:29], 16
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v52
 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
-; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16
+; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16
 ; SI-NEXT: buffer_load_dword v30,
off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshr_b64 v[29:30], v[29:30], 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshr_b64 v[30:31], v[30:31], 16 +; SI-NEXT: v_mov_b32_e32 v31, v32 ; SI-NEXT: .LBB63_5: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -102857,40 +103437,43 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-LABEL: bitcast_v64bf16_to_v16i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; VI-NEXT: v_mov_b32_e32 v31, v17 ; VI-NEXT: v_mov_b32_e32 v30, v16 ; VI-NEXT: v_mov_b32_e32 v29, v15 -; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 -; VI-NEXT: v_mov_b32_e32 v26, v12 ; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v23, v9 -; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v21, v7 -; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v19, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 ; VI-NEXT: v_mov_b32_e32 v17, v3 -; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v14, v0 -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 ; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: s_cbranch_scc0 .LBB63_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB63_3 @@ -102899,580 +103482,600 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 
v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v15 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v15, v18, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 ; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v14 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[33:34] +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v35, v18, v33, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[35:36] +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v13 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 ; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v13 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v35, v18, v33, vcc +; 
VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 ; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v12, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v12 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; VI-NEXT: v_lshrrev_b64 v[35:36], 16, v[35:36] +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[36:37] +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 ; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v11 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v11, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc +; VI-NEXT: 
v_lshrrev_b32_e32 v37, 16, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 ; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v10, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v10 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; VI-NEXT: v_lshrrev_b64 v[36:37], 16, v[36:37] +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v37, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[37:38] +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 ; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v9 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v37, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 ; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v8 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 
0x400000, v8 +; VI-NEXT: v_lshrrev_b64 v[37:38], 16, v[37:38] +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v38, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[38:39] +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 ; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v7 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v38, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v6 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, 
vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[48:49] +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v5 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 ; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v4 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[48:49] +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v49, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[49:50] +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 ; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v3 -; VI-NEXT: v_add_u32_e32 v33, 
vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v49, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v2 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[50:51] +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v1 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; 
VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; VI-NEXT: v_lshrrev_b64 v[50:51], 16, v[50:51] +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v51, v3, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v31, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v31 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v30, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v30 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_alignbit_b32 v30, v30, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v29 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: 
v_cndmask_b32_e32 v29, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v27 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v26, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v26 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v25 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 
0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v24, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v24 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_alignbit_b32 v24, v24, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v23 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_alignbit_b32 v23, v23, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v22, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v22 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v21 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v20, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v20 -; VI-NEXT: 
v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v19 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v17 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 
v52, 16, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[51:52] +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v51, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; VI-NEXT: v_lshrrev_b64 v[51:52], 16, v[51:52] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v52, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[30:31], 16, v[52:53] +; VI-NEXT: v_cndmask_b32_e32 v52, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 +; VI-NEXT: v_lshrrev_b64 v[52:53], 16, v[52:53] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v53, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[53:54] +; VI-NEXT: 
v_cndmask_b32_e32 v53, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 +; VI-NEXT: v_lshrrev_b64 v[53:54], 16, v[53:54] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v54, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[26:27], 16, v[54:55] +; VI-NEXT: v_cndmask_b32_e32 v54, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 +; VI-NEXT: v_lshrrev_b64 v[38:39], 16, v[38:39] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v39, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[24:25], 16, v[39:40] +; VI-NEXT: v_cndmask_b32_e32 v39, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; 
VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; VI-NEXT: v_lshrrev_b64 v[39:40], 16, v[39:40] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v40, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[40:41] +; VI-NEXT: v_cndmask_b32_e32 v40, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 +; VI-NEXT: v_lshrrev_b64 v[40:41], 16, v[40:41] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v41, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v18, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v31, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, 
v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[41:42] +; VI-NEXT: v_cndmask_b32_e32 v41, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v15, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b64 v[54:55], 16, v[54:55] +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_lshrrev_b64 v[41:42], 16, v[41:42] +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[31:32] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[15:16] +; VI-NEXT: v_mov_b32_e32 v1, v50 +; VI-NEXT: v_mov_b32_e32 v3, v49 +; VI-NEXT: v_mov_b32_e32 v5, v48 +; VI-NEXT: v_mov_b32_e32 v7, v38 +; VI-NEXT: v_mov_b32_e32 v9, v37 +; VI-NEXT: v_mov_b32_e32 v11, v36 +; VI-NEXT: v_mov_b32_e32 v13, v35 +; VI-NEXT: v_mov_b32_e32 v15, v34 +; VI-NEXT: v_mov_b32_e32 v17, v41 +; VI-NEXT: v_mov_b32_e32 v19, v18 +; VI-NEXT: v_mov_b32_e32 v21, v40 +; VI-NEXT: v_mov_b32_e32 v23, v39 +; VI-NEXT: v_mov_b32_e32 v25, v54 +; VI-NEXT: v_mov_b32_e32 v27, v53 +; VI-NEXT: v_mov_b32_e32 v29, v52 +; VI-NEXT: v_mov_b32_e32 v31, v51 ; VI-NEXT: .LBB63_3: ; %end +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB63_4: ; VI-NEXT: s_branch .LBB63_2 @@ -134979,94 +135582,92 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_writelane_b32 v62, s46, 3 ; SI-NEXT: s_cbranch_execnz .LBB77_4 ; SI-NEXT: .LBB77_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[35:36], s[44:45], 1.0 ; SI-NEXT: v_add_f64 v[3:4], s[6:7], 1.0 -; SI-NEXT: v_add_f64 v[49:50], s[28:29], 1.0 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v35 ; SI-NEXT: v_add_f64 v[1:2], s[22:23], 1.0 ; SI-NEXT: v_add_f64 v[41:42], s[24:25], 1.0 -; SI-NEXT: v_add_f64 v[27:28], s[40:41], 1.0 -; SI-NEXT: v_add_f64 v[15:16], s[10:11], 1.0 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill 
-; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_add_f64 v[31:32], s[42:43], 1.0 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v16 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v16 -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15 -; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v28 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v28 -; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v27 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v27 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v42 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v42 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v41 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v41 +; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v42 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v42 +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v41 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v41 ; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v2 ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v2 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_f64 v[2:3], s[20:21], 1.0 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 -; SI-NEXT: v_add_f64 v[7:8], s[4:5], 1.0 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v32 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v32 -; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v31 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v31 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v3 -; SI-NEXT: v_add_f64 v[3:4], s[16:17], 1.0 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v1 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_mov_b32_e32 v4, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_f64 v[51:52], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[49:50], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[35:36], s[44:45], 1.0 +; SI-NEXT: v_add_f64 v[31:32], s[42:43], 1.0 +; SI-NEXT: v_add_f64 v[27:28], s[40:41], 1.0 ; SI-NEXT: v_add_f64 v[23:24], s[14:15], 1.0 ; SI-NEXT: v_add_f64 v[19:20], s[12:13], 1.0 +; SI-NEXT: v_add_f64 v[15:16], s[10:11], 1.0 +; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 +; SI-NEXT: v_add_f64 v[7:8], s[4:5], 1.0 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 ; SI-NEXT: v_add_f64 v[59:60], s[18:19], 1.0 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v8 -; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v20 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v20 -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v19 -; 
SI-NEXT: v_lshlrev_b32_e32 v38, 16, v19 -; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v24 -; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v23 -; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v36 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v36 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v50 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v49 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v49 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f64 v[3:4], s[16:17], 1.0 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v32 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v36 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v36 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v50 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v50 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v49 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v49 ; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v52 ; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v52 ; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v51 ; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v2 -; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v60 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v60 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v1 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v2 +; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v60 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v60 ; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v59 ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_branch .LBB77_5 @@ -135141,15 +135742,18 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: ; kill: killed $sgpr46 ; SI-NEXT: s_branch .LBB77_2 ; SI-NEXT: .LBB77_4: -; SI-NEXT: v_mov_b32_e32 v1, s37 +; SI-NEXT: v_mov_b32_e32 v1, s59 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_mov_b32_e32 v1, s36 -; SI-NEXT: v_readlane_b32 s4, v62, 0 +; SI-NEXT: v_mov_b32_e32 v1, s58 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s34 -; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: v_mov_b32_e32 v1, s57 +; SI-NEXT: v_readlane_b32 s4, v62, 0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s56 +; SI-NEXT: v_mov_b32_e32 v61, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 1 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -135157,328 +135761,329 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_readlane_b32 s4, v62, 2 ; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 3 -; SI-NEXT: v_mov_b32_e32 v5, s59 -; SI-NEXT: v_mov_b32_e32 v4, s58 -; SI-NEXT: v_mov_b32_e32 v9, s57 -; SI-NEXT: v_mov_b32_e32 v6, s56 -; SI-NEXT: v_mov_b32_e32 v13, s99 -; SI-NEXT: v_mov_b32_e32 v10, s98 -; SI-NEXT: v_mov_b32_e32 v17, s97 -; SI-NEXT: v_mov_b32_e32 v14, s96 -; SI-NEXT: v_mov_b32_e32 v21, s87 -; SI-NEXT: v_mov_b32_e32 v18, s86 -; SI-NEXT: v_mov_b32_e32 v25, s85 -; SI-NEXT: v_mov_b32_e32 v22, s84 -; SI-NEXT: v_mov_b32_e32 v29, s83 -; SI-NEXT: v_mov_b32_e32 v26, s82 -; SI-NEXT: v_mov_b32_e32 v33, s81 -; SI-NEXT: v_mov_b32_e32 v30, s80 -; SI-NEXT: v_mov_b32_e32 v37, s71 -; SI-NEXT: v_mov_b32_e32 v34, s70 -; SI-NEXT: v_mov_b32_e32 v39, s69 -; SI-NEXT: v_mov_b32_e32 v38, s68 -; SI-NEXT: v_mov_b32_e32 v53, s67 -; SI-NEXT: v_mov_b32_e32 v48, s66 -; SI-NEXT: v_mov_b32_e32 v55, s65 -; SI-NEXT: v_mov_b32_e32 v54, s64 -; SI-NEXT: v_mov_b32_e32 v43, s55 -; SI-NEXT: v_mov_b32_e32 v40, s54 -; SI-NEXT: v_mov_b32_e32 v45, s53 -; SI-NEXT: v_mov_b32_e32 v44, s52 -; SI-NEXT: v_mov_b32_e32 v47, s51 -; SI-NEXT: v_mov_b32_e32 v46, s50 -; SI-NEXT: v_mov_b32_e32 v57, s49 -; SI-NEXT: v_mov_b32_e32 v56, s48 -; SI-NEXT: v_mov_b32_e32 v61, s39 -; SI-NEXT: v_mov_b32_e32 v58, s38 -; SI-NEXT: v_mov_b32_e32 v8, s35 -; SI-NEXT: v_mov_b32_e32 v24, s31 -; SI-NEXT: v_mov_b32_e32 v23, s30 +; SI-NEXT: v_mov_b32_e32 v6, s99 +; SI-NEXT: v_mov_b32_e32 v5, s98 +; SI-NEXT: v_mov_b32_e32 v8, s97 +; SI-NEXT: v_mov_b32_e32 v7, s96 +; SI-NEXT: v_mov_b32_e32 v10, s87 +; SI-NEXT: v_mov_b32_e32 v9, s86 +; SI-NEXT: v_mov_b32_e32 v12, s85 +; SI-NEXT: v_mov_b32_e32 v11, s84 +; SI-NEXT: v_mov_b32_e32 v14, s83 +; SI-NEXT: v_mov_b32_e32 v13, s82 +; SI-NEXT: v_mov_b32_e32 v16, s81 +; SI-NEXT: v_mov_b32_e32 v15, s80 +; SI-NEXT: v_mov_b32_e32 v18, s71 +; SI-NEXT: v_mov_b32_e32 v17, s70 +; SI-NEXT: v_mov_b32_e32 v20, s69 +; SI-NEXT: v_mov_b32_e32 v19, s68 +; SI-NEXT: v_mov_b32_e32 v22, s67 +; SI-NEXT: v_mov_b32_e32 v21, s66 +; SI-NEXT: v_mov_b32_e32 v24, s65 +; SI-NEXT: v_mov_b32_e32 v23, s64 +; SI-NEXT: v_mov_b32_e32 v26, s55 +; SI-NEXT: v_mov_b32_e32 v25, s54 +; SI-NEXT: v_mov_b32_e32 v28, s53 +; SI-NEXT: v_mov_b32_e32 v27, s52 +; SI-NEXT: v_mov_b32_e32 v30, s51 +; SI-NEXT: v_mov_b32_e32 v29, s50 +; SI-NEXT: v_mov_b32_e32 v32, s49 +; SI-NEXT: v_mov_b32_e32 v31, s48 +; SI-NEXT: v_mov_b32_e32 v34, s39 +; SI-NEXT: v_mov_b32_e32 v33, s38 +; SI-NEXT: v_mov_b32_e32 v36, s37 +; SI-NEXT: v_mov_b32_e32 v35, s36 +; SI-NEXT: v_mov_b32_e32 v38, s35 +; SI-NEXT: v_mov_b32_e32 v37, s34 +; SI-NEXT: v_mov_b32_e32 v48, s31 +; SI-NEXT: v_mov_b32_e32 v39, s30 ; SI-NEXT: v_mov_b32_e32 v50, s95 ; SI-NEXT: v_mov_b32_e32 v49, s94 ; SI-NEXT: v_mov_b32_e32 v52, s93 ; 
SI-NEXT: v_mov_b32_e32 v51, s92 -; SI-NEXT: v_mov_b32_e32 v16, s91 -; SI-NEXT: v_mov_b32_e32 v15, s90 -; SI-NEXT: v_mov_b32_e32 v28, s89 -; SI-NEXT: v_mov_b32_e32 v27, s88 +; SI-NEXT: v_mov_b32_e32 v54, s91 +; SI-NEXT: v_mov_b32_e32 v53, s90 +; SI-NEXT: v_mov_b32_e32 v40, s89 +; SI-NEXT: v_mov_b32_e32 v55, s88 ; SI-NEXT: v_mov_b32_e32 v42, s79 ; SI-NEXT: v_mov_b32_e32 v41, s78 -; SI-NEXT: v_mov_b32_e32 v11, s77 -; SI-NEXT: v_mov_b32_e32 v12, s76 -; SI-NEXT: v_mov_b32_e32 v32, s75 -; SI-NEXT: v_mov_b32_e32 v31, s74 -; SI-NEXT: v_mov_b32_e32 v19, s73 -; SI-NEXT: v_mov_b32_e32 v20, s72 -; SI-NEXT: v_mov_b32_e32 v36, s63 -; SI-NEXT: v_mov_b32_e32 v35, s62 +; SI-NEXT: v_mov_b32_e32 v43, s77 +; SI-NEXT: v_mov_b32_e32 v44, s76 +; SI-NEXT: v_mov_b32_e32 v46, s75 +; SI-NEXT: v_mov_b32_e32 v45, s74 +; SI-NEXT: v_mov_b32_e32 v47, s73 +; SI-NEXT: v_mov_b32_e32 v56, s72 +; SI-NEXT: v_mov_b32_e32 v58, s63 +; SI-NEXT: v_mov_b32_e32 v57, s62 ; SI-NEXT: v_mov_b32_e32 v60, s61 ; SI-NEXT: v_mov_b32_e32 v59, s60 ; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: .LBB77_5: ; %end ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[3:4], 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v61 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v60 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v59 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v35 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v58 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v20 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v56 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v46 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; 
SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v12 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v44 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v42 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v41 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v27 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v40 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v52 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v51 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v49 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v23 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_readlane_b32 s30, v63, 34 -; SI-NEXT: v_readlane_b32 s31, v63, 35 -; SI-NEXT: v_readlane_b32 s99, v63, 33 -; SI-NEXT: v_readlane_b32 s98, v63, 32 -; SI-NEXT: v_readlane_b32 s97, v63, 31 -; SI-NEXT: v_readlane_b32 s96, v63, 30 -; SI-NEXT: v_readlane_b32 s87, v63, 29 -; SI-NEXT: v_readlane_b32 s86, v63, 28 -; SI-NEXT: v_readlane_b32 s85, v63, 27 -; SI-NEXT: 
v_readlane_b32 s84, v63, 26 -; SI-NEXT: v_readlane_b32 s83, v63, 25 -; SI-NEXT: v_readlane_b32 s82, v63, 24 -; SI-NEXT: v_readlane_b32 s81, v63, 23 -; SI-NEXT: v_readlane_b32 s80, v63, 22 -; SI-NEXT: v_readlane_b32 s71, v63, 21 -; SI-NEXT: v_readlane_b32 s70, v63, 20 -; SI-NEXT: v_readlane_b32 s69, v63, 19 -; SI-NEXT: v_readlane_b32 s68, v63, 18 -; SI-NEXT: v_readlane_b32 s67, v63, 17 -; SI-NEXT: v_readlane_b32 s66, v63, 16 -; SI-NEXT: v_readlane_b32 s65, v63, 15 -; SI-NEXT: v_readlane_b32 s64, v63, 14 -; SI-NEXT: v_readlane_b32 s55, v63, 13 -; SI-NEXT: v_readlane_b32 s54, v63, 12 -; SI-NEXT: v_readlane_b32 s53, v63, 11 -; SI-NEXT: v_readlane_b32 s52, v63, 10 -; SI-NEXT: v_readlane_b32 s51, v63, 9 -; SI-NEXT: v_readlane_b32 s50, v63, 8 -; SI-NEXT: v_readlane_b32 s49, v63, 7 -; SI-NEXT: v_readlane_b32 s48, v63, 6 -; SI-NEXT: v_readlane_b32 s39, v63, 5 -; SI-NEXT: v_readlane_b32 s38, v63, 4 -; SI-NEXT: v_readlane_b32 s37, v63, 3 -; SI-NEXT: v_readlane_b32 s36, v63, 2 -; SI-NEXT: v_readlane_b32 s35, v63, 1 -; SI-NEXT: v_readlane_b32 s34, v63, 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v61 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v46 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v29 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, 
v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v44 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v27 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v30 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v29 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v26 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 
1.0, v25 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v18 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v14 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s30, v63, 34 +; SI-NEXT: v_readlane_b32 s31, v63, 35 +; SI-NEXT: v_readlane_b32 s99, v63, 33 +; SI-NEXT: v_readlane_b32 s98, v63, 32 +; SI-NEXT: v_readlane_b32 s97, v63, 31 +; SI-NEXT: v_readlane_b32 s96, v63, 30 +; SI-NEXT: v_readlane_b32 s87, v63, 29 +; SI-NEXT: v_readlane_b32 s86, v63, 28 +; SI-NEXT: v_readlane_b32 s85, v63, 27 +; SI-NEXT: v_readlane_b32 s84, v63, 26 +; SI-NEXT: v_readlane_b32 s83, v63, 25 +; SI-NEXT: v_readlane_b32 s82, v63, 24 +; SI-NEXT: v_readlane_b32 s81, v63, 23 +; SI-NEXT: v_readlane_b32 s80, v63, 22 +; SI-NEXT: v_readlane_b32 s71, v63, 21 +; SI-NEXT: v_readlane_b32 s70, v63, 20 +; SI-NEXT: v_readlane_b32 s69, v63, 19 +; SI-NEXT: v_readlane_b32 s68, v63, 18 +; SI-NEXT: v_readlane_b32 s67, v63, 17 +; SI-NEXT: v_readlane_b32 s66, v63, 16 +; SI-NEXT: v_readlane_b32 s65, v63, 15 +; SI-NEXT: v_readlane_b32 s64, v63, 14 +; SI-NEXT: v_readlane_b32 s55, v63, 13 +; SI-NEXT: v_readlane_b32 s54, v63, 12 +; SI-NEXT: v_readlane_b32 s53, v63, 11 +; SI-NEXT: v_readlane_b32 s52, v63, 10 +; SI-NEXT: v_readlane_b32 s51, v63, 9 +; SI-NEXT: v_readlane_b32 s50, v63, 8 +; SI-NEXT: v_readlane_b32 s49, v63, 7 +; SI-NEXT: v_readlane_b32 s48, v63, 6 +; SI-NEXT: v_readlane_b32 s39, v63, 5 +; SI-NEXT: v_readlane_b32 s38, v63, 4 +; SI-NEXT: v_readlane_b32 s37, v63, 3 +; SI-NEXT: v_readlane_b32 s36, v63, 2 +; SI-NEXT: v_readlane_b32 s35, v63, 1 +; SI-NEXT: v_readlane_b32 s34, v63, 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; 
SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -138704,562 +139309,737 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v52, v30 -; SI-NEXT: v_mov_b32_e32 v53, v28 -; SI-NEXT: v_mov_b32_e32 v40, v12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 +; 
SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 -; SI-NEXT: v_mov_b32_e32 v55, v14 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_mov_b32_e32 v43, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 +; SI-NEXT: v_mov_b32_e32 v54, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v41, v23 +; SI-NEXT: v_mov_b32_e32 v29, v20 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v59, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v27 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21 +; 
SI-NEXT: v_mul_f32_e64 v5, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v28 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v0 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v38 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v49 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v39 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v31 -; SI-NEXT: s_waitcnt vmcnt(13) 
-; SI-NEXT: v_mul_f32_e32 v38, 1.0, v60 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s27 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v40 ; SI-NEXT: v_mul_f32_e32 v28, 1.0, v42 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v35 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v36 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s24 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: 
buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v52 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v45, 1.0, s28 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB79_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v3, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_alignbit_b32 v2, v2, v8, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v9, 16 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v58 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v56 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_alignbit_b32 v5, v5, v11, 16 -; SI-NEXT: v_alignbit_b32 
v7, v7, v14, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v54, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v46, 16 -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v61 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_alignbit_b32 v13, v13, v47, 16 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v45 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v57 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_alignbit_b32 v4, v4, v34, 16 -; SI-NEXT: v_mov_b32_e32 v63, v44 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_alignbit_b32 v16, v16, v43, 16 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_alignbit_b32 v17, v17, v41, 16 -; SI-NEXT: v_alignbit_b32 v18, v18, v40, 16 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_alignbit_b32 v19, v19, v55, 16 -; SI-NEXT: v_alignbit_b32 v20, v20, v48, 16 -; SI-NEXT: v_mov_b32_e32 v48, v53 -; SI-NEXT: v_alignbit_b32 v21, v21, v53, 16 -; SI-NEXT: v_alignbit_b32 v22, v22, v52, 16 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_alignbit_b32 v23, v23, v51, 16 -; SI-NEXT: v_alignbit_b32 v24, v24, v50, 16 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_alignbit_b32 v25, v25, v49, 16 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: v_alignbit_b32 v26, v26, v39, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v34, v38 -; SI-NEXT: v_alignbit_b32 v27, v27, v38, 16 -; SI-NEXT: v_mov_b32_e32 v35, v37 -; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16 -; SI-NEXT: v_mov_b32_e32 v37, v32 -; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; SI-NEXT: v_alignbit_b32 v31, v31, v42, 16 +; SI-NEXT: v_mov_b32_e32 v0, v19 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v57 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, 
s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v42 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_alignbit_b32 v10, v10, v61, 16 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_alignbit_b32 v12, v12, v54, 16 -; SI-NEXT: v_mov_b32_e32 v41, v61 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 +; SI-NEXT: v_mov_b32_e32 v1, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 16 +; SI-NEXT: v_mov_b32_e32 v2, v14 +; SI-NEXT: v_mov_b32_e32 v49, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16 +; SI-NEXT: v_mov_b32_e32 v3, v16 +; SI-NEXT: v_mov_b32_e32 v20, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v35 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16 +; SI-NEXT: v_mov_b32_e32 v4, v18 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[4:5], v[18:19], 16 +; SI-NEXT: v_mov_b32_e32 v5, v29 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[5:6], v[29:30], 16 +; SI-NEXT: v_mov_b32_e32 v6, v45 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[6:7], v[45:46], 16 +; SI-NEXT: v_mov_b32_e32 v7, v39 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[7:8], v[39:40], 16 +; SI-NEXT: v_mov_b32_e32 v8, v9 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v49 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 
4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: v_mov_b32_e32 v9, v54 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[9:10], v[54:55], 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v55, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; SI-NEXT: v_mov_b32_e32 v10, v11 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[10:11], v[11:12], 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 +; SI-NEXT: v_mov_b32_e32 v11, v56 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[11:12], v[56:57], 16 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v56, v44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshr_b64 v[12:13], v[13:14], 16 +; SI-NEXT: v_mov_b32_e32 v13, v58 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[13:14], v[58:59], 16 +; SI-NEXT: v_mov_b32_e32 v14, v60 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[14:15], v[60:61], 16 +; SI-NEXT: v_mov_b32_e32 v15, v62 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[15:16], v[62:63], 16 +; SI-NEXT: v_mov_b32_e32 v16, v32 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[16:17], v[32:33], 16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_alignbit_b32 v11, v11, v59, 16 -; SI-NEXT: v_mov_b32_e32 v55, v59 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_alignbit_b32 v14, v14, v45, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v17 +; SI-NEXT: v_mov_b32_e32 v40, v17 +; SI-NEXT: v_lshr_b64 
v[17:18], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[18:19], v[47:48], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[36:37], 16 +; SI-NEXT: v_mov_b32_e32 v20, v38 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[20:21], v[38:39], 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_mov_b32_e32 v34, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_mov_b32_e32 v21, v22 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[21:22], v[22:23], 16 +; SI-NEXT: v_mov_b32_e32 v22, v31 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[22:23], v[31:32], 16 +; SI-NEXT: v_mov_b32_e32 v23, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[23:24], v[24:25], 16 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24 +; SI-NEXT: v_mov_b32_e32 v24, v41 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[24:25], v[41:42], 16 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v41, v26 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v50 +; SI-NEXT: v_mov_b32_e32 v42, v51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_lshr_b64 v[25:26], v[26:27], 16 +; SI-NEXT: v_mov_b32_e32 v26, v43 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[26:27], v[43:44], 16 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v43, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; SI-NEXT: v_lshr_b64 v[27:28], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[51:52], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[53:54], 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshr_b64 
v[30:31], v[31:32], 16 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v15, v15, v47, 16 -; SI-NEXT: v_mov_b32_e32 v51, v47 -; SI-NEXT: v_mov_b32_e32 v53, v45 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v57 +; SI-NEXT: v_mov_b32_e32 v53, v31 +; SI-NEXT: v_lshr_b64 v[31:32], v[31:32], 16 ; SI-NEXT: s_branch .LBB79_3 ; SI-NEXT: .LBB79_2: -; SI-NEXT: v_mov_b32_e32 v63, v44 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v48, v53 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v62, v61 -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v34, v38 -; SI-NEXT: v_mov_b32_e32 v35, v37 -; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v56, v44 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v55, v13 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v47 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v43, v28 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v0 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v42 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v41, v26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v51 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB79_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v38, v50 -; SI-NEXT: v_mov_b32_e32 v39, v52 -; SI-NEXT: v_mov_b32_e32 v49, v40 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v43, v44 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; 
SI-NEXT: v_mov_b32_e32 v35, v56 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mov_b32_e32 v32, v40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v33, v38 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v54, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v44, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v45, v56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v47, v56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v58, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: s_cbranch_vccnz .LBB79_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], 
s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v40 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v63 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v62 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v58 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: 
v_lshrrev_b32_e32 v32, 16, v32 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; 
SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; SI-NEXT: 
v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v47 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v45 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v52 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v41 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v55 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v54 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v56 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v53 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v51 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v43 +; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v50 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshr_b64 v[17:18], v[17:18], 16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 +; SI-NEXT: v_lshr_b64 v[18:19], v[18:19], 16 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshr_b64 v[19:20], v[19:20], 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_lshr_b64 v[32:33], v[32:33], 16 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshr_b64 v[20:21], v[20:21], 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt 
vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v39 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshr_b64 v[21:22], v[21:22], 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshr_b64 v[22:23], v[22:23], 16 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshr_b64 v[23:24], v[23:24], 16 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v36 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshr_b64 v[24:25], v[24:25], 16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshr_b64 v[25:26], v[25:26], 16 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v35 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshr_b64 v[26:27], v[26:27], 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshr_b64 v[27:28], v[27:28], 16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: 
v_alignbit_b32 v28, v29, v28, 16 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v37 +; SI-NEXT: v_lshr_b64 v[28:29], v[28:29], 16 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshr_b64 v[29:30], v[29:30], 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshr_b64 v[30:31], v[30:31], 16 +; SI-NEXT: v_mov_b32_e32 v31, v32 ; SI-NEXT: .LBB79_5: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -139283,40 +140063,43 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-LABEL: bitcast_v64bf16_to_v16f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; VI-NEXT: v_mov_b32_e32 v31, v17 ; VI-NEXT: v_mov_b32_e32 v30, v16 ; VI-NEXT: v_mov_b32_e32 v29, v15 -; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 -; VI-NEXT: v_mov_b32_e32 v26, v12 ; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v24, v10 ; VI-NEXT: v_mov_b32_e32 v23, v9 -; VI-NEXT: v_mov_b32_e32 v22, v8 ; VI-NEXT: v_mov_b32_e32 v21, v7 -; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v19, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 ; VI-NEXT: v_mov_b32_e32 v17, v3 -; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_mov_b32_e32 v14, v0 -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v6, s22 ; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: 
s_cbranch_scc0 .LBB79_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB79_3 @@ -139325,580 +140108,600 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v15 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v15, v18, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 ; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v14 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[33:34] +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v35, v18, v33, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[35:36] +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v13 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 ; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v13, 16, 1 
-; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v13 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v35, v18, v33, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 ; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v12, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v12 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; VI-NEXT: v_lshrrev_b64 v[35:36], 16, v[35:36] +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[36:37] +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 ; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v11 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v15, v18, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_alignbit_b32 
v11, v11, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 ; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v10, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v10 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; VI-NEXT: v_lshrrev_b64 v[36:37], 16, v[36:37] +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v37, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[37:38] +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 ; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v9 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v37, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 ; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; 
VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v8 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; VI-NEXT: v_lshrrev_b64 v[37:38], 16, v[37:38] +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v38, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[38:39] +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 ; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v7 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v38, v11, v13, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v6 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_cndmask_b32_e32 v6, v33, 
v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[48:49] +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 ; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v5 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 ; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v4 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[48:49] +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v49, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[49:50] +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 ; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, 
v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v3 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v49, v7, v9, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v2 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[50:51] +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v1 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; 
VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; VI-NEXT: v_lshrrev_b64 v[50:51], 16, v[50:51] +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v51, v3, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v31, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v31 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v30, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v30 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_alignbit_b32 v30, v30, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v29, 
0x40c00000, v29 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v29 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v27 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v26, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v26 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v25 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; 
VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v24, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v24 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_alignbit_b32 v24, v24, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v23 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_alignbit_b32 v23, v23, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v22, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v22 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v21 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v20, 
0xffff0000, v20 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v20, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v20 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v19 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v17 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; 
VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[51:52] +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v51, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; VI-NEXT: v_lshrrev_b64 v[51:52], 16, v[51:52] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v52, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[30:31], 16, v[52:53] +; VI-NEXT: v_cndmask_b32_e32 v52, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 +; VI-NEXT: v_lshrrev_b64 v[52:53], 16, v[52:53] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v53, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, 
vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[53:54] +; VI-NEXT: v_cndmask_b32_e32 v53, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 +; VI-NEXT: v_lshrrev_b64 v[53:54], 16, v[53:54] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v54, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[26:27], 16, v[54:55] +; VI-NEXT: v_cndmask_b32_e32 v54, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 +; VI-NEXT: v_lshrrev_b64 v[38:39], 16, v[38:39] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v39, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[24:25], 16, v[39:40] +; VI-NEXT: v_cndmask_b32_e32 v39, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; 
VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; VI-NEXT: v_lshrrev_b64 v[39:40], 16, v[39:40] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v40, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[40:41] +; VI-NEXT: v_cndmask_b32_e32 v40, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 +; VI-NEXT: v_lshrrev_b64 v[40:41], 16, v[40:41] +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v41, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v18, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 
+; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v31, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[41:42] +; VI-NEXT: v_cndmask_b32_e32 v41, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v15, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b64 v[54:55], 16, v[54:55] +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_lshrrev_b64 v[41:42], 16, v[41:42] +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[31:32] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[15:16] +; VI-NEXT: v_mov_b32_e32 v1, v50 +; VI-NEXT: v_mov_b32_e32 v3, v49 +; VI-NEXT: v_mov_b32_e32 v5, v48 +; VI-NEXT: v_mov_b32_e32 v7, v38 +; VI-NEXT: v_mov_b32_e32 v9, v37 +; VI-NEXT: v_mov_b32_e32 v11, v36 +; VI-NEXT: v_mov_b32_e32 v13, v35 +; VI-NEXT: v_mov_b32_e32 v15, v34 +; VI-NEXT: v_mov_b32_e32 v17, v41 +; VI-NEXT: v_mov_b32_e32 v19, v18 +; VI-NEXT: v_mov_b32_e32 v21, v40 +; VI-NEXT: v_mov_b32_e32 v23, v39 +; VI-NEXT: v_mov_b32_e32 v25, v54 +; VI-NEXT: v_mov_b32_e32 v27, v53 +; VI-NEXT: v_mov_b32_e32 v29, v52 +; VI-NEXT: v_mov_b32_e32 v31, v51 ; VI-NEXT: .LBB79_3: ; %end +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB79_4: ; VI-NEXT: s_branch .LBB79_2 @@ -155767,9 +156570,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; 
SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_writelane_b32 v41, s34, 0 ; SI-NEXT: v_writelane_b32 v41, s35, 1 ; SI-NEXT: v_writelane_b32 v41, s36, 2 @@ -155814,48 +156618,51 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:312 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:308 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304 -; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane -; SI-NEXT: s_mov_b32 s72, s21 +; SI-NEXT: ; implicit-def: $vgpr44 : SGPR spill to VGPR lane +; SI-NEXT: s_mov_b32 s73, s21 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_writelane_b32 v43, s19, 0 -; SI-NEXT: v_writelane_b32 v43, s18, 1 -; SI-NEXT: v_writelane_b32 v43, s17, 2 -; SI-NEXT: v_writelane_b32 v43, s16, 3 -; SI-NEXT: s_mov_b32 s60, s24 -; SI-NEXT: s_mov_b32 s77, s28 +; SI-NEXT: v_writelane_b32 v44, s19, 0 +; SI-NEXT: v_writelane_b32 v44, s18, 1 +; SI-NEXT: v_writelane_b32 v44, s17, 2 +; SI-NEXT: v_writelane_b32 v44, s16, 3 +; SI-NEXT: s_mov_b32 s74, s29 +; SI-NEXT: s_mov_b32 s78, s28 ; SI-NEXT: s_mov_b32 s76, s27 -; SI-NEXT: s_mov_b32 s79, s26 +; SI-NEXT: s_mov_b32 s47, s26 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 ; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 +; SI-NEXT: v_readfirstlane_b32 s37, v22 +; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane ; SI-NEXT: v_readfirstlane_b32 s38, v20 -; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane +; SI-NEXT: v_writelane_b32 v43, s37, 0 ; SI-NEXT: v_readfirstlane_b32 s39, v19 -; SI-NEXT: v_writelane_b32 v42, s38, 0 +; SI-NEXT: v_writelane_b32 v43, s38, 1 ; SI-NEXT: v_readfirstlane_b32 s48, v25 -; SI-NEXT: v_writelane_b32 v42, s39, 1 +; SI-NEXT: v_writelane_b32 v43, s39, 2 ; SI-NEXT: v_readfirstlane_b32 s49, v26 -; SI-NEXT: v_writelane_b32 v42, s48, 2 +; SI-NEXT: v_writelane_b32 v43, s48, 3 ; SI-NEXT: v_readfirstlane_b32 s50, v24 -; SI-NEXT: v_writelane_b32 v42, s49, 3 +; SI-NEXT: v_writelane_b32 v43, s49, 4 ; SI-NEXT: v_readfirstlane_b32 s51, v23 -; SI-NEXT: v_writelane_b32 v42, s50, 4 +; SI-NEXT: v_writelane_b32 v43, s50, 5 ; SI-NEXT: v_readfirstlane_b32 s52, v29 -; SI-NEXT: v_writelane_b32 v42, s51, 5 +; SI-NEXT: v_writelane_b32 v43, s51, 6 ; SI-NEXT: v_readfirstlane_b32 s53, v30 -; SI-NEXT: v_writelane_b32 v42, s52, 6 +; SI-NEXT: v_writelane_b32 v43, s52, 7 ; SI-NEXT: v_readfirstlane_b32 s54, v28 -; SI-NEXT: v_writelane_b32 v42, s53, 7 +; SI-NEXT: v_writelane_b32 v43, s53, 8 ; SI-NEXT: v_readfirstlane_b32 s55, v27 -; SI-NEXT: v_writelane_b32 v42, s54, 8 -; SI-NEXT: v_writelane_b32 v42, s55, 9 +; SI-NEXT: v_writelane_b32 v43, s54, 9 +; SI-NEXT: v_writelane_b32 v43, s55, 10 +; SI-NEXT: s_mov_b32 s57, s24 ; SI-NEXT: v_readfirstlane_b32 s16, v1 ; SI-NEXT: v_readfirstlane_b32 s17, v2 ; SI-NEXT: v_readfirstlane_b32 s18, v5 ; SI-NEXT: v_readfirstlane_b32 s19, v6 -; SI-NEXT: v_readfirstlane_b32 s88, v4 +; SI-NEXT: v_readfirstlane_b32 s77, v4 ; SI-NEXT: v_readfirstlane_b32 s89, v3 ; SI-NEXT: v_readfirstlane_b32 s90, v9 ; SI-NEXT: v_readfirstlane_b32 s91, v10 @@ -155868,7 +156675,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_readfirstlane_b32 
s34, v16 ; SI-NEXT: v_readfirstlane_b32 s35, v15 ; SI-NEXT: v_readfirstlane_b32 s36, v21 -; SI-NEXT: v_readfirstlane_b32 s37, v22 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s6, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 @@ -155879,43 +156685,45 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:280 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v32 -; SI-NEXT: v_writelane_b32 v43, s4, 4 +; SI-NEXT: v_writelane_b32 v44, s4, 4 ; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 -; SI-NEXT: v_writelane_b32 v43, s4, 5 +; SI-NEXT: v_writelane_b32 v44, s4, 5 ; SI-NEXT: v_readfirstlane_b32 s4, v34 -; SI-NEXT: v_writelane_b32 v43, s4, 6 +; SI-NEXT: v_writelane_b32 v44, s4, 6 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v35 -; SI-NEXT: v_writelane_b32 v43, s4, 7 +; SI-NEXT: v_writelane_b32 v44, s4, 7 ; SI-NEXT: v_readfirstlane_b32 s4, v36 -; SI-NEXT: v_writelane_b32 v43, s4, 8 +; SI-NEXT: v_writelane_b32 v44, s4, 8 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_readfirstlane_b32 s4, v37 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:268 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256 -; SI-NEXT: v_writelane_b32 v43, s4, 9 +; SI-NEXT: v_writelane_b32 v44, s4, 9 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v38 -; SI-NEXT: v_writelane_b32 v43, s4, 10 +; SI-NEXT: v_writelane_b32 v44, s4, 10 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s24, v40 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: v_writelane_b32 v43, s4, 11 +; SI-NEXT: v_writelane_b32 v44, s4, 11 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s4, v39 -; SI-NEXT: v_writelane_b32 v43, s4, 12 +; SI-NEXT: v_writelane_b32 v44, s4, 12 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s4, v48 -; SI-NEXT: v_writelane_b32 v43, s4, 13 +; SI-NEXT: v_writelane_b32 v44, s4, 13 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s4, v49 -; SI-NEXT: v_writelane_b32 v43, s4, 14 +; SI-NEXT: v_writelane_b32 v44, s4, 14 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v50 -; SI-NEXT: v_writelane_b32 v43, s4, 15 +; SI-NEXT: v_writelane_b32 v44, s4, 15 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s4, v51 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252 @@ -155928,33 +156736,33 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s75, v32 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s61, v33 +; SI-NEXT: v_readfirstlane_b32 s21, v33 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220 -; SI-NEXT: v_writelane_b32 v43, s4, 16 +; SI-NEXT: v_writelane_b32 v44, s4, 16 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s43, v34 +; SI-NEXT: v_readfirstlane_b32 s4, v34 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s40, v35 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s4, v36 +; SI-NEXT: v_readfirstlane_b32 s61, v36 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 
s63, v37 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:212 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:204 -; SI-NEXT: v_writelane_b32 v43, s4, 17 +; SI-NEXT: v_writelane_b32 v44, s4, 17 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s59, v31 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s42, v38 +; SI-NEXT: v_readfirstlane_b32 s56, v38 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s73, v39 +; SI-NEXT: v_readfirstlane_b32 s43, v39 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s21, v48 +; SI-NEXT: v_readfirstlane_b32 s46, v48 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s57, v49 +; SI-NEXT: v_readfirstlane_b32 s42, v49 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s13, v50 ; SI-NEXT: s_waitcnt vmcnt(6) @@ -155967,43 +156775,44 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s47, v32 +; SI-NEXT: v_readfirstlane_b32 s88, v32 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s24, v33 +; SI-NEXT: v_readfirstlane_b32 s79, v33 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:168 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s78, v34 +; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: v_writelane_b32 v44, s4, 18 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v35 -; SI-NEXT: v_writelane_b32 v43, s4, 18 +; SI-NEXT: v_writelane_b32 v44, s4, 19 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s4, v36 -; SI-NEXT: v_writelane_b32 v43, s4, 19 +; SI-NEXT: v_writelane_b32 v44, s4, 20 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s4, v37 -; SI-NEXT: v_writelane_b32 v43, s4, 20 +; SI-NEXT: v_writelane_b32 v44, s4, 21 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: v_writelane_b32 v43, s4, 21 +; SI-NEXT: v_writelane_b32 v44, s4, 22 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v38 -; SI-NEXT: v_writelane_b32 v43, s4, 22 +; SI-NEXT: v_writelane_b32 v44, s4, 23 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s4, v39 -; SI-NEXT: v_writelane_b32 v43, s4, 23 +; SI-NEXT: v_writelane_b32 v44, s4, 24 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_readfirstlane_b32 s4, v48 -; SI-NEXT: v_writelane_b32 v43, s4, 24 +; SI-NEXT: v_writelane_b32 v44, s4, 25 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_readfirstlane_b32 s4, v49 -; SI-NEXT: v_writelane_b32 v43, s4, 25 +; SI-NEXT: v_writelane_b32 v44, s4, 26 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_readfirstlane_b32 s4, v50 -; SI-NEXT: v_writelane_b32 v43, s4, 26 +; SI-NEXT: v_writelane_b32 v44, s4, 27 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_readfirstlane_b32 s4, v51 -; SI-NEXT: v_writelane_b32 v43, s4, 27 +; SI-NEXT: v_writelane_b32 v44, s4, 28 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 ; SI-NEXT: s_waitcnt vmcnt(3) @@ -156019,42 +156828,41 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v50, off, 
s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 -; SI-NEXT: v_writelane_b32 v43, s4, 28 +; SI-NEXT: v_writelane_b32 v44, s4, 29 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_readfirstlane_b32 s4, v52 -; SI-NEXT: v_writelane_b32 v43, s4, 29 +; SI-NEXT: v_writelane_b32 v44, s4, 30 ; SI-NEXT: v_readfirstlane_b32 s4, v53 -; SI-NEXT: v_writelane_b32 v43, s4, 30 +; SI-NEXT: v_writelane_b32 v44, s4, 31 ; SI-NEXT: v_readfirstlane_b32 s4, v54 -; SI-NEXT: v_writelane_b32 v43, s4, 31 +; SI-NEXT: v_writelane_b32 v44, s4, 32 ; SI-NEXT: v_readfirstlane_b32 s4, v55 -; SI-NEXT: v_writelane_b32 v43, s4, 32 -; SI-NEXT: v_readfirstlane_b32 s4, v40 -; SI-NEXT: v_writelane_b32 v43, s4, 33 -; SI-NEXT: v_writelane_b32 v43, s22, 34 -; SI-NEXT: v_writelane_b32 v43, s23, 35 -; SI-NEXT: v_writelane_b32 v43, s72, 36 -; SI-NEXT: v_writelane_b32 v43, s20, 37 -; SI-NEXT: v_writelane_b32 v43, s79, 38 -; SI-NEXT: v_writelane_b32 v43, s76, 39 -; SI-NEXT: v_writelane_b32 v43, s25, 40 -; SI-NEXT: v_writelane_b32 v43, s60, 41 -; SI-NEXT: v_writelane_b32 v43, s29, 42 -; SI-NEXT: v_writelane_b32 v43, s77, 43 -; SI-NEXT: v_writelane_b32 v43, s16, 44 -; SI-NEXT: v_writelane_b32 v43, s17, 45 -; SI-NEXT: v_writelane_b32 v43, s18, 46 -; SI-NEXT: v_writelane_b32 v43, s19, 47 -; SI-NEXT: v_writelane_b32 v43, s88, 48 -; SI-NEXT: v_writelane_b32 v43, s89, 49 -; SI-NEXT: v_writelane_b32 v43, s90, 50 -; SI-NEXT: v_writelane_b32 v43, s91, 51 -; SI-NEXT: v_writelane_b32 v43, s92, 52 -; SI-NEXT: v_writelane_b32 v43, s93, 53 -; SI-NEXT: v_writelane_b32 v43, s94, 54 -; SI-NEXT: v_writelane_b32 v43, s95, 55 +; SI-NEXT: v_writelane_b32 v44, s4, 33 +; SI-NEXT: v_writelane_b32 v44, s22, 34 +; SI-NEXT: v_writelane_b32 v44, s23, 35 +; SI-NEXT: v_writelane_b32 v44, s73, 36 +; SI-NEXT: v_writelane_b32 v44, s20, 37 +; SI-NEXT: v_writelane_b32 v44, s47, 38 +; SI-NEXT: v_writelane_b32 v44, s76, 39 +; SI-NEXT: v_writelane_b32 v44, s25, 40 +; SI-NEXT: v_writelane_b32 v44, s57, 41 +; SI-NEXT: v_writelane_b32 v44, s74, 42 +; SI-NEXT: v_writelane_b32 v44, s78, 43 +; SI-NEXT: v_writelane_b32 v44, s24, 44 +; SI-NEXT: v_writelane_b32 v44, s16, 45 +; SI-NEXT: v_writelane_b32 v44, s17, 46 +; SI-NEXT: v_writelane_b32 v44, s18, 47 +; SI-NEXT: v_writelane_b32 v44, s19, 48 +; SI-NEXT: v_writelane_b32 v44, s77, 49 +; SI-NEXT: v_writelane_b32 v44, s89, 50 +; SI-NEXT: v_writelane_b32 v44, s90, 51 +; SI-NEXT: v_writelane_b32 v44, s91, 52 +; SI-NEXT: v_writelane_b32 v44, s92, 53 +; SI-NEXT: v_writelane_b32 v44, s93, 54 +; SI-NEXT: v_writelane_b32 v44, s94, 55 +; SI-NEXT: v_writelane_b32 v44, s95, 56 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s62, v33 +; SI-NEXT: v_readfirstlane_b32 s58, v33 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s10, v34 ; SI-NEXT: s_waitcnt vmcnt(8) @@ -156062,7 +156870,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_readfirstlane_b32 s28, v31 ; SI-NEXT: v_readfirstlane_b32 s27, v32 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s58, v36 +; SI-NEXT: v_readfirstlane_b32 s29, v36 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s69, v37 ; SI-NEXT: s_waitcnt vmcnt(5) @@ -156093,32 +156901,31 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_readfirstlane_b32 vcc_lo, v12 ; SI-NEXT: v_readfirstlane_b32 vcc_hi, v11 -; SI-NEXT: v_writelane_b32 v43, vcc_lo, 56 -; SI-NEXT: v_writelane_b32 v43, vcc_hi, 57 -; SI-NEXT: 
v_writelane_b32 v43, s30, 58 -; SI-NEXT: v_writelane_b32 v43, s31, 59 -; SI-NEXT: v_writelane_b32 v43, s34, 60 -; SI-NEXT: v_writelane_b32 v43, s35, 61 -; SI-NEXT: v_writelane_b32 v43, s36, 62 -; SI-NEXT: v_writelane_b32 v43, s37, 63 +; SI-NEXT: v_writelane_b32 v44, vcc_lo, 57 +; SI-NEXT: v_writelane_b32 v44, vcc_hi, 58 +; SI-NEXT: v_writelane_b32 v44, s30, 59 +; SI-NEXT: v_writelane_b32 v44, s31, 60 +; SI-NEXT: v_writelane_b32 v44, s34, 61 +; SI-NEXT: v_writelane_b32 v44, s35, 62 +; SI-NEXT: v_writelane_b32 v44, s36, 63 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s74, v31 +; SI-NEXT: v_readfirstlane_b32 s60, v31 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s46, v32 +; SI-NEXT: v_readfirstlane_b32 s62, v32 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s96, v33 +; SI-NEXT: v_readfirstlane_b32 s83, v33 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s98, v34 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s41, v35 +; SI-NEXT: v_readfirstlane_b32 s81, v35 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s56, v36 +; SI-NEXT: v_readfirstlane_b32 s72, v36 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s87, v37 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_readfirstlane_b32 s99, v38 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s81, v39 +; SI-NEXT: v_readfirstlane_b32 s82, v39 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 @@ -156130,9 +156937,9 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s26, v48 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s83, v49 +; SI-NEXT: v_readfirstlane_b32 s15, v49 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s82, v50 +; SI-NEXT: v_readfirstlane_b32 s96, v50 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s7, v51 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16 @@ -156141,7 +156948,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s15, v31 +; SI-NEXT: v_readfirstlane_b32 s41, v31 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s97, v32 ; SI-NEXT: s_waitcnt vmcnt(10) @@ -156162,144 +156969,146 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_readfirstlane_b32 s65, v48 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_readfirstlane_b32 s64, v49 -; SI-NEXT: v_writelane_b32 v42, s64, 10 +; SI-NEXT: v_writelane_b32 v43, s64, 11 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_readfirstlane_b32 s67, v50 -; SI-NEXT: v_writelane_b32 v42, s65, 11 +; SI-NEXT: v_writelane_b32 v43, s65, 12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s84, v51 -; SI-NEXT: v_writelane_b32 v42, s67, 12 -; SI-NEXT: v_writelane_b32 v42, s84, 13 -; SI-NEXT: v_writelane_b32 v42, s85, 14 -; SI-NEXT: v_writelane_b32 v42, s86, 15 -; SI-NEXT: v_writelane_b32 v42, s87, 16 -; SI-NEXT: v_writelane_b32 v42, s8, 17 -; SI-NEXT: v_writelane_b32 v42, s99, 18 -; SI-NEXT: v_writelane_b32 v42, s12, 19 -; SI-NEXT: v_writelane_b32 v42, s44, 20 -; SI-NEXT: v_writelane_b32 v42, s97, 21 -; SI-NEXT: v_writelane_b32 v42, s83, 22 -; SI-NEXT: 
v_writelane_b32 v42, s82, 23 -; SI-NEXT: v_writelane_b32 v42, s98, 24 -; SI-NEXT: v_writelane_b32 v42, s96, 25 -; SI-NEXT: v_writelane_b32 v42, s81, 26 -; SI-NEXT: v_writelane_b32 v42, s9, 27 -; SI-NEXT: v_writelane_b32 v42, s41, 28 -; SI-NEXT: v_writelane_b32 v42, s80, 29 -; SI-NEXT: v_writelane_b32 v42, s7, 30 -; SI-NEXT: v_writelane_b32 v42, s56, 31 -; SI-NEXT: v_writelane_b32 v42, s26, 32 -; SI-NEXT: v_writelane_b32 v42, s15, 33 -; SI-NEXT: v_writelane_b32 v42, s14, 34 -; SI-NEXT: v_writelane_b32 v42, s69, 35 -; SI-NEXT: v_writelane_b32 v42, s71, 36 -; SI-NEXT: v_writelane_b32 v42, s70, 37 -; SI-NEXT: v_writelane_b32 v42, s68, 38 -; SI-NEXT: v_writelane_b32 v42, s74, 39 -; SI-NEXT: v_writelane_b32 v42, s46, 40 -; SI-NEXT: v_writelane_b32 v42, s11, 41 -; SI-NEXT: v_writelane_b32 v42, s10, 42 -; SI-NEXT: v_writelane_b32 v42, s62, 43 -; SI-NEXT: v_writelane_b32 v42, s66, 44 -; SI-NEXT: v_writelane_b32 v42, s58, 45 -; SI-NEXT: v_writelane_b32 v42, s28, 46 -; SI-NEXT: v_writelane_b32 v42, s27, 47 -; SI-NEXT: v_writelane_b32 v42, s78, 48 -; SI-NEXT: v_writelane_b32 v42, s24, 49 +; SI-NEXT: v_writelane_b32 v43, s67, 13 +; SI-NEXT: v_writelane_b32 v43, s84, 14 +; SI-NEXT: v_writelane_b32 v43, s85, 15 +; SI-NEXT: v_writelane_b32 v43, s86, 16 +; SI-NEXT: v_writelane_b32 v43, s87, 17 +; SI-NEXT: v_writelane_b32 v43, s8, 18 +; SI-NEXT: v_writelane_b32 v43, s99, 19 +; SI-NEXT: v_writelane_b32 v43, s12, 20 +; SI-NEXT: v_writelane_b32 v43, s44, 21 +; SI-NEXT: v_writelane_b32 v43, s97, 22 +; SI-NEXT: v_writelane_b32 v43, s15, 23 +; SI-NEXT: v_writelane_b32 v43, s96, 24 +; SI-NEXT: v_writelane_b32 v43, s98, 25 +; SI-NEXT: v_writelane_b32 v43, s83, 26 +; SI-NEXT: v_writelane_b32 v43, s82, 27 +; SI-NEXT: v_writelane_b32 v43, s9, 28 +; SI-NEXT: v_writelane_b32 v43, s81, 29 +; SI-NEXT: v_writelane_b32 v43, s80, 30 +; SI-NEXT: v_writelane_b32 v43, s7, 31 +; SI-NEXT: v_writelane_b32 v43, s72, 32 +; SI-NEXT: v_writelane_b32 v43, s26, 33 +; SI-NEXT: v_writelane_b32 v43, s41, 34 +; SI-NEXT: v_writelane_b32 v43, s14, 35 +; SI-NEXT: v_writelane_b32 v43, s69, 36 +; SI-NEXT: v_writelane_b32 v43, s71, 37 +; SI-NEXT: v_writelane_b32 v43, s70, 38 +; SI-NEXT: v_writelane_b32 v43, s68, 39 +; SI-NEXT: v_writelane_b32 v43, s60, 40 +; SI-NEXT: v_writelane_b32 v43, s62, 41 +; SI-NEXT: v_writelane_b32 v43, s11, 42 +; SI-NEXT: v_writelane_b32 v43, s10, 43 +; SI-NEXT: v_writelane_b32 v43, s58, 44 +; SI-NEXT: v_writelane_b32 v43, s66, 45 +; SI-NEXT: v_writelane_b32 v43, s29, 46 +; SI-NEXT: v_writelane_b32 v43, s28, 47 +; SI-NEXT: v_writelane_b32 v43, s27, 48 ; SI-NEXT: s_cbranch_scc0 .LBB89_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_readlane_b32 s4, v43, 3 +; SI-NEXT: v_readlane_b32 s4, v44, 3 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: v_readlane_b32 s5, v43, 2 +; SI-NEXT: v_readlane_b32 s5, v44, 2 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_writelane_b32 v42, s4, 56 -; SI-NEXT: v_readlane_b32 s4, v43, 1 +; SI-NEXT: v_writelane_b32 v43, s4, 58 +; SI-NEXT: v_readlane_b32 s4, v44, 1 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: v_readlane_b32 s5, v43, 0 +; SI-NEXT: v_readlane_b32 s5, v44, 0 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_writelane_b32 v42, s4, 57 +; SI-NEXT: v_writelane_b32 v43, s4, 59 ; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: s_lshl_b32 s5, s73, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s22, 0xff ; SI-NEXT: s_lshl_b32 s5, 
s5, 16 ; SI-NEXT: s_mov_b32 s22, s6 ; SI-NEXT: s_lshl_b32 s6, s23, 24 -; SI-NEXT: v_writelane_b32 v42, s4, 58 +; SI-NEXT: v_writelane_b32 v43, s4, 60 ; SI-NEXT: s_or_b32 s4, s6, s5 -; SI-NEXT: s_and_b32 s5, s60, 0xff +; SI-NEXT: s_and_b32 s5, s57, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s25, 24 -; SI-NEXT: v_writelane_b32 v42, s4, 59 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_writelane_b32 v42, s5, 60 -; SI-NEXT: s_and_b32 s5, s79, 0xff +; SI-NEXT: v_writelane_b32 v43, s4, 61 +; SI-NEXT: s_or_b32 s4, s6, s5 +; SI-NEXT: s_and_b32 s5, s47, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s76, 24 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_writelane_b32 v42, s5, 61 -; SI-NEXT: s_and_b32 s5, s77, 0xff -; SI-NEXT: s_lshl_b32 s6, s29, 8 +; SI-NEXT: v_writelane_b32 v43, s4, 62 +; SI-NEXT: s_or_b32 s4, s6, s5 +; SI-NEXT: s_and_b32 s5, s78, 0xff +; SI-NEXT: s_lshl_b32 s6, s74, 8 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s16, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s16, s17, 24 -; SI-NEXT: s_or_b32 s6, s16, s6 -; SI-NEXT: v_writelane_b32 v42, s6, 62 +; SI-NEXT: v_writelane_b32 v43, s4, 63 +; SI-NEXT: s_or_b32 s4, s16, s6 ; SI-NEXT: s_and_b32 s6, s89, 0xff +; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s16, s88, 24 -; SI-NEXT: s_mov_b32 s4, s47 -; SI-NEXT: s_or_b32 s47, s16, s6 +; SI-NEXT: s_lshl_b32 s16, s77, 24 +; SI-NEXT: v_writelane_b32 v42, s4, 0 +; SI-NEXT: s_or_b32 s6, s16, s6 +; SI-NEXT: v_writelane_b32 v42, s6, 1 ; SI-NEXT: s_and_b32 s6, s18, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s16, s19, 24 -; SI-NEXT: s_or_b32 s25, s16, s6 +; SI-NEXT: s_or_b32 s76, s16, s6 ; SI-NEXT: s_and_b32 s6, s93, 0xff ; SI-NEXT: s_lshl_b32 s16, s92, 8 ; SI-NEXT: s_or_b32 s6, s6, s16 ; SI-NEXT: s_and_b32 s16, s90, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s17, s91, 24 -; SI-NEXT: s_or_b32 s92, s17, s16 +; SI-NEXT: s_or_b32 s77, s17, s16 ; SI-NEXT: s_and_b32 s16, vcc_hi, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s17, vcc_lo, 24 -; SI-NEXT: s_or_b32 s76, s17, s16 +; SI-NEXT: s_or_b32 s25, s17, s16 ; SI-NEXT: s_and_b32 s16, s94, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s17, s95, 24 -; SI-NEXT: s_or_b32 s91, s17, s16 +; SI-NEXT: s_or_b32 s74, s17, s16 ; SI-NEXT: s_and_b32 s16, s35, 0xff ; SI-NEXT: s_lshl_b32 s17, s34, 8 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_and_b32 s17, s30, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s31, 24 -; SI-NEXT: s_or_b32 s77, s18, s17 +; SI-NEXT: s_or_b32 s78, s18, s17 ; SI-NEXT: s_and_b32 s17, s39, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s38, 24 -; SI-NEXT: s_or_b32 s79, s18, s17 +; SI-NEXT: s_mov_b32 s31, s88 +; SI-NEXT: s_or_b32 s88, s18, s17 ; SI-NEXT: s_and_b32 s17, s36, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s37, 24 -; SI-NEXT: s_or_b32 s93, s18, s17 +; SI-NEXT: s_or_b32 s89, s18, s17 ; SI-NEXT: s_and_b32 s17, s51, 0xff ; SI-NEXT: s_lshl_b32 s18, s50, 8 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_and_b32 s18, s48, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_lshl_b32 s19, s49, 24 -; SI-NEXT: s_or_b32 s89, s19, s18 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: v_writelane_b32 v43, s18, 49 ; SI-NEXT: s_and_b32 s18, s55, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_lshl_b32 s19, s54, 24 -; SI-NEXT: s_or_b32 s31, s19, s18 +; SI-NEXT: s_mov_b32 s73, s79 +; 
SI-NEXT: s_or_b32 s79, s19, s18 ; SI-NEXT: s_and_b32 s18, s52, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_lshl_b32 s19, s53, 24 @@ -156310,7 +157119,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s19, s64, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s65, 24 -; SI-NEXT: s_or_b32 s60, s20, s19 +; SI-NEXT: s_or_b32 s95, s20, s19 ; SI-NEXT: s_and_b32 s19, s12, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s8, 24 @@ -156326,217 +157135,226 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s97, 24 ; SI-NEXT: s_or_b32 s9, s20, s19 -; SI-NEXT: s_and_b32 s19, s15, 0xff +; SI-NEXT: s_and_b32 s19, s41, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s7, 24 ; SI-NEXT: s_or_b32 s7, s20, s19 -; SI-NEXT: s_and_b32 s19, s82, 0xff +; SI-NEXT: s_and_b32 s19, s96, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s83, 24 -; SI-NEXT: s_or_b32 s23, s20, s19 +; SI-NEXT: s_lshl_b32 s20, s15, 24 +; SI-NEXT: v_writelane_b32 v43, s12, 50 +; SI-NEXT: s_or_b32 s12, s20, s19 ; SI-NEXT: s_and_b32 s19, s26, 0xff -; SI-NEXT: s_lshl_b32 s20, s81, 8 +; SI-NEXT: s_lshl_b32 s20, s82, 8 ; SI-NEXT: s_or_b32 vcc_hi, s19, s20 ; SI-NEXT: s_and_b32 s19, s99, 0xff -; SI-NEXT: v_writelane_b32 v42, s9, 50 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s87, 24 -; SI-NEXT: v_writelane_b32 v42, s7, 51 -; SI-NEXT: s_or_b32 s7, s20, s19 -; SI-NEXT: s_and_b32 s19, s56, 0xff +; SI-NEXT: v_writelane_b32 v43, s9, 51 +; SI-NEXT: s_or_b32 s9, s20, s19 +; SI-NEXT: s_and_b32 s19, s72, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s41, 24 -; SI-NEXT: v_writelane_b32 v42, s7, 52 -; SI-NEXT: s_or_b32 s7, s20, s19 +; SI-NEXT: s_lshl_b32 s20, s81, 24 +; SI-NEXT: v_writelane_b32 v43, s9, 52 +; SI-NEXT: s_or_b32 s9, s20, s19 ; SI-NEXT: s_and_b32 s19, s98, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s96, 24 -; SI-NEXT: v_writelane_b32 v42, s7, 54 -; SI-NEXT: s_or_b32 s7, s20, s19 -; SI-NEXT: s_and_b32 s19, s46, 0xff -; SI-NEXT: s_lshl_b32 s20, s74, 8 +; SI-NEXT: s_lshl_b32 s20, s83, 24 +; SI-NEXT: v_writelane_b32 v43, s9, 54 +; SI-NEXT: s_or_b32 s9, s20, s19 +; SI-NEXT: s_and_b32 s19, s62, 0xff +; SI-NEXT: s_lshl_b32 s20, s60, 8 ; SI-NEXT: s_or_b32 s84, s19, s20 ; SI-NEXT: s_and_b32 s19, s71, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s70, 24 -; SI-NEXT: s_or_b32 s72, s20, s19 +; SI-NEXT: v_writelane_b32 v43, s9, 53 +; SI-NEXT: s_or_b32 s9, s20, s19 ; SI-NEXT: s_and_b32 s19, s11, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s68, 24 -; SI-NEXT: v_writelane_b32 v42, s7, 53 -; SI-NEXT: s_or_b32 s7, s20, s19 +; SI-NEXT: s_or_b32 s57, s20, s19 ; SI-NEXT: s_and_b32 s19, s14, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s69, 24 +; SI-NEXT: v_writelane_b32 v43, s9, 55 ; SI-NEXT: s_or_b32 s9, s20, s19 -; SI-NEXT: s_and_b32 s19, s58, 0xff +; SI-NEXT: s_and_b32 s19, s29, 0xff ; SI-NEXT: s_lshl_b32 s20, s66, 8 ; SI-NEXT: s_or_b32 s85, s19, s20 ; SI-NEXT: s_and_b32 s19, s10, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s62, 24 -; SI-NEXT: s_or_b32 s49, s20, s19 +; SI-NEXT: s_lshl_b32 s20, s58, 24 +; SI-NEXT: v_writelane_b32 v43, s9, 56 +; SI-NEXT: s_or_b32 s9, s20, s19 ; SI-NEXT: s_and_b32 s19, s27, 0xff -; SI-NEXT: v_writelane_b32 v42, s9, 55 ; SI-NEXT: s_lshl_b32 s19, s19, 
16 ; SI-NEXT: s_lshl_b32 s20, s28, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 33 -; SI-NEXT: s_or_b32 s50, s20, s19 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 32 +; SI-NEXT: v_writelane_b32 v43, s9, 57 +; SI-NEXT: s_or_b32 s23, s20, s19 +; SI-NEXT: s_and_b32 s19, s24, 0xff +; SI-NEXT: v_readlane_b32 s9, v44, 33 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 31 -; SI-NEXT: s_or_b32 s51, s20, s19 +; SI-NEXT: v_readlane_b32 s9, v44, 32 +; SI-NEXT: s_or_b32 s10, s20, s19 ; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 30 +; SI-NEXT: v_readlane_b32 s9, v44, 31 ; SI-NEXT: s_lshl_b32 s20, s9, 8 -; SI-NEXT: v_readlane_b32 s9, v43, 29 +; SI-NEXT: v_readlane_b32 s9, v44, 30 ; SI-NEXT: s_or_b32 s86, s19, s20 ; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 28 +; SI-NEXT: v_readlane_b32 s9, v44, 29 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 27 -; SI-NEXT: s_or_b32 s52, s20, s19 +; SI-NEXT: v_readlane_b32 s9, v44, 28 +; SI-NEXT: s_or_b32 s47, s20, s19 ; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 26 +; SI-NEXT: v_readlane_b32 s9, v44, 27 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 25 -; SI-NEXT: s_or_b32 s53, s20, s19 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 24 +; SI-NEXT: v_readlane_b32 s11, v44, 26 +; SI-NEXT: s_or_b32 s9, s20, s19 +; SI-NEXT: s_and_b32 s19, s11, 0xff +; SI-NEXT: v_readlane_b32 s11, v44, 25 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 23 -; SI-NEXT: s_or_b32 s54, s20, s19 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 22 -; SI-NEXT: s_lshl_b32 s20, s9, 8 -; SI-NEXT: v_readlane_b32 s9, v43, 21 +; SI-NEXT: s_lshl_b32 s20, s11, 24 +; SI-NEXT: v_readlane_b32 s11, v44, 24 +; SI-NEXT: s_or_b32 s24, s20, s19 +; SI-NEXT: s_mov_b32 s92, s11 +; SI-NEXT: s_and_b32 s19, s11, 0xff +; SI-NEXT: v_readlane_b32 s11, v44, 23 +; SI-NEXT: s_mov_b32 s36, s11 +; SI-NEXT: s_lshl_b32 s20, s11, 8 +; SI-NEXT: v_readlane_b32 s11, v44, 22 ; SI-NEXT: s_or_b32 s87, s19, s20 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 20 +; SI-NEXT: s_mov_b32 s62, s11 +; SI-NEXT: s_and_b32 s19, s11, 0xff +; SI-NEXT: v_readlane_b32 s11, v44, 21 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 19 -; SI-NEXT: s_or_b32 s55, s20, s19 -; SI-NEXT: s_mov_b32 s58, s9 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 18 +; SI-NEXT: s_mov_b32 s30, s11 +; SI-NEXT: s_lshl_b32 s20, s11, 24 +; SI-NEXT: v_readlane_b32 s11, v44, 20 +; SI-NEXT: s_or_b32 s58, s20, s19 +; SI-NEXT: s_mov_b32 s91, s11 +; SI-NEXT: s_and_b32 s19, s11, 0xff +; SI-NEXT: v_readlane_b32 s11, v44, 19 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: s_or_b32 s64, s20, s19 -; SI-NEXT: s_and_b32 s19, s78, 0xff +; SI-NEXT: s_mov_b32 s35, s11 +; SI-NEXT: s_lshl_b32 s20, s11, 24 +; SI-NEXT: v_readlane_b32 s11, v44, 18 +; SI-NEXT: s_mov_b32 s4, s46 +; SI-NEXT: s_or_b32 s46, s20, s19 +; SI-NEXT: s_and_b32 s19, s11, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s24, 24 -; SI-NEXT: s_or_b32 s65, s20, s19 -; SI-NEXT: s_and_b32 s19, s4, 0xff +; SI-NEXT: s_lshl_b32 s20, s73, 24 +; SI-NEXT: s_mov_b32 s52, s73 +; SI-NEXT: s_or_b32 s73, s20, s19 
+; SI-NEXT: s_and_b32 s19, s31, 0xff ; SI-NEXT: s_lshl_b32 s20, s45, 8 ; SI-NEXT: s_or_b32 s26, s19, s20 ; SI-NEXT: s_and_b32 s19, s13, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s57, 24 -; SI-NEXT: s_or_b32 s66, s20, s19 -; SI-NEXT: s_and_b32 s19, s21, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s73, 24 +; SI-NEXT: s_lshl_b32 s20, s42, 24 ; SI-NEXT: s_or_b32 s67, s20, s19 -; SI-NEXT: s_and_b32 s19, s42, 0xff -; SI-NEXT: v_readlane_b32 s88, v43, 17 +; SI-NEXT: s_and_b32 s19, s4, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s43, 24 +; SI-NEXT: s_mov_b32 s53, s42 +; SI-NEXT: s_or_b32 s42, s20, s19 +; SI-NEXT: s_and_b32 s19, s56, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s59, 24 ; SI-NEXT: s_or_b32 s68, s20, s19 ; SI-NEXT: s_and_b32 s19, s63, 0xff -; SI-NEXT: s_lshl_b32 s20, s88, 8 +; SI-NEXT: s_lshl_b32 s20, s61, 8 +; SI-NEXT: v_readlane_b32 s93, v44, 17 ; SI-NEXT: s_or_b32 s27, s19, s20 ; SI-NEXT: s_and_b32 s19, s40, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s43, 24 -; SI-NEXT: s_or_b32 s69, s20, s19 -; SI-NEXT: s_and_b32 s19, s61, 0xff -; SI-NEXT: s_mov_b32 s39, s57 -; SI-NEXT: s_mov_b32 s57, s7 +; SI-NEXT: s_lshl_b32 s20, s93, 24 +; SI-NEXT: s_or_b32 s70, s20, s19 +; SI-NEXT: s_and_b32 s19, s21, 0xff +; SI-NEXT: s_mov_b32 s51, s59 +; SI-NEXT: s_mov_b32 s59, s7 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s75, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 16 -; SI-NEXT: s_or_b32 s70, s20, s19 +; SI-NEXT: v_readlane_b32 s7, v44, 16 +; SI-NEXT: s_mov_b32 s48, s56 +; SI-NEXT: s_mov_b32 s56, s10 +; SI-NEXT: s_or_b32 s69, s20, s19 ; SI-NEXT: s_mov_b32 s10, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 15 +; SI-NEXT: v_readlane_b32 s7, v44, 15 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_mov_b32 s71, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 14 -; SI-NEXT: s_or_b32 s62, s20, s19 -; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 13 +; SI-NEXT: v_readlane_b32 s7, v44, 14 +; SI-NEXT: s_mov_b32 s39, s75 +; SI-NEXT: s_mov_b32 s75, s94 +; SI-NEXT: s_or_b32 s94, s20, s19 ; SI-NEXT: s_mov_b32 s41, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 13 +; SI-NEXT: s_mov_b32 s14, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 12 +; SI-NEXT: v_readlane_b32 s7, v44, 12 ; SI-NEXT: s_or_b32 s29, s19, s20 -; SI-NEXT: s_mov_b32 s14, s7 +; SI-NEXT: s_mov_b32 s81, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 11 +; SI-NEXT: v_readlane_b32 s7, v44, 11 +; SI-NEXT: s_mov_b32 s55, s45 +; SI-NEXT: s_mov_b32 s45, s9 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_mov_b32 s9, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 10 -; SI-NEXT: s_or_b32 s80, s20, s19 -; SI-NEXT: s_mov_b32 s56, s7 +; SI-NEXT: v_readlane_b32 s7, v44, 10 +; SI-NEXT: s_mov_b32 s38, s11 +; SI-NEXT: s_or_b32 s11, s20, s19 +; SI-NEXT: s_mov_b32 s72, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 9 +; SI-NEXT: v_readlane_b32 s7, v44, 9 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_mov_b32 s81, s7 -; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 8 -; SI-NEXT: s_or_b32 s11, s20, s19 ; SI-NEXT: s_mov_b32 s82, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 8 +; SI-NEXT: s_or_b32 s80, s20, s19 +; SI-NEXT: s_mov_b32 s83, s7 ; SI-NEXT: 
s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 7 +; SI-NEXT: v_readlane_b32 s7, v44, 7 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_mov_b32 s96, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 6 -; SI-NEXT: s_mov_b32 s36, s63 -; SI-NEXT: s_mov_b32 s63, s93 -; SI-NEXT: s_mov_b32 s93, s61 -; SI-NEXT: s_mov_b32 s61, s91 -; SI-NEXT: s_mov_b32 s91, s75 -; SI-NEXT: s_mov_b32 s75, s92 -; SI-NEXT: s_or_b32 s92, s20, s19 +; SI-NEXT: v_readlane_b32 s7, v44, 6 +; SI-NEXT: s_mov_b32 s90, s31 +; SI-NEXT: s_or_b32 s31, s20, s19 ; SI-NEXT: s_mov_b32 s98, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 5 +; SI-NEXT: v_readlane_b32 s7, v44, 5 ; SI-NEXT: s_mov_b32 s44, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 4 -; SI-NEXT: s_mov_b32 s48, s13 -; SI-NEXT: s_mov_b32 s13, s94 -; SI-NEXT: s_mov_b32 s94, s21 +; SI-NEXT: v_readlane_b32 s7, v44, 4 +; SI-NEXT: s_mov_b32 s37, s43 +; SI-NEXT: s_mov_b32 s43, s93 +; SI-NEXT: s_mov_b32 s93, s21 ; SI-NEXT: s_or_b32 s21, s19, s20 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: s_mov_b32 s95, s4 +; SI-NEXT: s_mov_b32 s34, s4 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s22, 24 -; SI-NEXT: v_readlane_b32 s4, v42, 58 -; SI-NEXT: s_mov_b32 s46, s45 -; SI-NEXT: s_mov_b32 s34, s73 -; SI-NEXT: s_mov_b32 s73, s12 -; SI-NEXT: s_mov_b32 s37, s42 -; SI-NEXT: s_mov_b32 s38, s59 -; SI-NEXT: s_mov_b32 s59, s8 -; SI-NEXT: s_mov_b32 s30, s88 -; SI-NEXT: s_mov_b32 s88, s31 -; SI-NEXT: s_mov_b32 s78, s40 -; SI-NEXT: s_mov_b32 s31, s43 +; SI-NEXT: v_readlane_b32 s4, v43, 60 +; SI-NEXT: s_mov_b32 s54, s13 +; SI-NEXT: s_mov_b32 s13, s12 +; SI-NEXT: s_mov_b32 s50, s63 +; SI-NEXT: s_mov_b32 s63, s95 +; SI-NEXT: s_mov_b32 s49, s61 +; SI-NEXT: s_mov_b32 s61, s8 +; SI-NEXT: s_mov_b32 s60, s40 ; SI-NEXT: s_mov_b32 s12, s7 ; SI-NEXT: s_mov_b32 s7, s22 -; SI-NEXT: s_or_b32 s83, s20, s19 +; SI-NEXT: s_or_b32 s15, s20, s19 ; SI-NEXT: s_lshl_b32 s20, s4, 16 -; SI-NEXT: s_lshl_b32 s74, s5, 16 +; SI-NEXT: s_lshl_b32 s95, s5, 16 ; SI-NEXT: s_lshl_b32 s22, s6, 16 ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s19, s17, 16 @@ -156548,16 +157366,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_lshl_b32 s97, s86, 16 ; SI-NEXT: s_lshl_b32 s28, s87, 16 ; SI-NEXT: s_lshl_b32 s87, s26, 16 -; SI-NEXT: v_readlane_b32 s26, v42, 56 +; SI-NEXT: v_readlane_b32 s26, v43, 58 ; SI-NEXT: s_lshl_b32 s86, s27, 16 -; SI-NEXT: v_readlane_b32 s27, v42, 57 -; SI-NEXT: v_readlane_b32 s35, v42, 61 +; SI-NEXT: v_readlane_b32 s27, v43, 59 +; SI-NEXT: v_readlane_b32 s66, v43, 63 ; SI-NEXT: s_lshl_b32 s85, s29, 16 -; SI-NEXT: v_readlane_b32 s29, v42, 60 -; SI-NEXT: v_readlane_b32 s24, v42, 59 -; SI-NEXT: v_readlane_b32 s90, v42, 62 +; SI-NEXT: v_readlane_b32 s29, v43, 62 +; SI-NEXT: v_readlane_b32 s65, v43, 61 +; SI-NEXT: v_readlane_b32 s64, v42, 0 ; SI-NEXT: s_lshl_b32 s84, s21, 16 -; SI-NEXT: s_mov_b32 s21, s47 +; SI-NEXT: v_readlane_b32 s21, v42, 1 ; SI-NEXT: s_cbranch_execnz .LBB89_3 ; SI-NEXT: .LBB89_2: ; %cmp.true ; SI-NEXT: s_add_i32 s4, s98, 3 @@ -156572,10 +157390,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s5, s56, 3 +; SI-NEXT: s_add_i32 s5, s72, 3 ; SI-NEXT: s_and_b32 s5, s5, 0xff -; SI-NEXT: s_lshl_b32 s6, s81, 8 -; SI-NEXT: s_add_i32 s16, s82, 3 +; SI-NEXT: s_lshl_b32 s6, s82, 8 +; SI-NEXT: 
s_add_i32 s16, s83, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s6, s96, 24 @@ -156584,10 +157402,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s6, s6, s16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_add_i32 s6, s15, 3 +; SI-NEXT: s_add_i32 s6, s41, 3 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s16, s41, 8 -; SI-NEXT: s_add_i32 s17, s14, 3 +; SI-NEXT: s_lshl_b32 s16, s14, 8 +; SI-NEXT: s_add_i32 s17, s81, 3 ; SI-NEXT: s_or_b32 s6, s16, s6 ; SI-NEXT: s_and_b32 s17, s17, 0xff ; SI-NEXT: s_lshl_b32 s16, s9, 24 @@ -156598,7 +157416,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s6, s16, s6 ; SI-NEXT: s_add_i32 s16, s93, 3 ; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s17, s91, 8 +; SI-NEXT: s_lshl_b32 s17, s39, 8 ; SI-NEXT: s_add_i32 s18, s10, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_and_b32 s18, s18, 0xff @@ -156608,150 +157426,143 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s17, s36, 3 +; SI-NEXT: s_add_i32 s17, s50, 3 ; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: s_lshl_b32 s18, s30, 8 -; SI-NEXT: s_add_i32 s19, s78, 3 +; SI-NEXT: s_lshl_b32 s18, s49, 8 +; SI-NEXT: s_add_i32 s19, s60, 3 ; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: s_and_b32 s19, s19, 0xff -; SI-NEXT: s_lshl_b32 s18, s31, 24 +; SI-NEXT: s_lshl_b32 s18, s43, 24 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_addk_i32 s17, 0x300 ; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: s_and_b32 s17, s17, 0xffff ; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_add_i32 s18, s94, 3 +; SI-NEXT: s_add_i32 s18, s34, 3 ; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_lshl_b32 s19, s34, 8 -; SI-NEXT: s_add_i32 s20, s37, 3 +; SI-NEXT: s_lshl_b32 s19, s37, 8 +; SI-NEXT: s_add_i32 s20, s48, 3 ; SI-NEXT: s_or_b32 s18, s19, s18 ; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s19, s38, 24 +; SI-NEXT: s_lshl_b32 s19, s51, 24 ; SI-NEXT: s_lshl_b32 s20, s20, 16 ; SI-NEXT: s_addk_i32 s18, 0x300 ; SI-NEXT: s_or_b32 s19, s19, s20 ; SI-NEXT: s_and_b32 s18, s18, 0xffff ; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_add_i32 s19, s95, 3 +; SI-NEXT: s_add_i32 s19, s90, 3 ; SI-NEXT: s_and_b32 s19, s19, 0xff -; SI-NEXT: s_lshl_b32 s20, s46, 8 -; SI-NEXT: s_add_i32 s22, s48, 3 +; SI-NEXT: s_lshl_b32 s20, s55, 8 +; SI-NEXT: s_add_i32 s22, s54, 3 ; SI-NEXT: s_or_b32 s19, s20, s19 ; SI-NEXT: s_and_b32 s22, s22, 0xff -; SI-NEXT: s_lshl_b32 s20, s39, 24 +; SI-NEXT: s_lshl_b32 s20, s53, 24 ; SI-NEXT: s_lshl_b32 s22, s22, 16 ; SI-NEXT: s_addk_i32 s19, 0x300 ; SI-NEXT: s_or_b32 s20, s20, s22 ; SI-NEXT: s_and_b32 s19, s19, 0xffff ; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: s_add_i32 s20, s58, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 18 +; SI-NEXT: s_add_i32 s20, s91, 3 ; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s22, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 49 +; SI-NEXT: s_lshl_b32 s22, s35, 8 +; SI-NEXT: s_add_i32 s23, s38, 3 ; SI-NEXT: s_or_b32 s20, s22, s20 -; SI-NEXT: s_lshl_b32 s22, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 48 -; SI-NEXT: s_add_i32 s23, s7, 3 ; SI-NEXT: s_and_b32 s23, s23, 0xff +; SI-NEXT: s_lshl_b32 s22, s52, 24 ; SI-NEXT: s_lshl_b32 s23, s23, 16 ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_or_b32 s22, s22, s23 ; SI-NEXT: 
s_and_b32 s20, s20, 0xffff -; SI-NEXT: v_readlane_b32 s7, v43, 23 ; SI-NEXT: s_or_b32 s20, s22, s20 -; SI-NEXT: s_add_i32 s22, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 22 +; SI-NEXT: s_add_i32 s22, s92, 3 ; SI-NEXT: s_and_b32 s22, s22, 0xff -; SI-NEXT: s_lshl_b32 s23, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 20 +; SI-NEXT: s_lshl_b32 s23, s36, 8 +; SI-NEXT: s_add_i32 s60, s62, 3 ; SI-NEXT: s_or_b32 s22, s23, s22 -; SI-NEXT: s_lshl_b32 s23, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 21 -; SI-NEXT: s_add_i32 s60, s7, 3 ; SI-NEXT: s_and_b32 s60, s60, 0xff +; SI-NEXT: s_lshl_b32 s23, s30, 24 ; SI-NEXT: s_lshl_b32 s60, s60, 16 ; SI-NEXT: s_addk_i32 s22, 0x300 ; SI-NEXT: s_or_b32 s23, s23, s60 ; SI-NEXT: s_and_b32 s22, s22, 0xffff -; SI-NEXT: v_readlane_b32 s7, v43, 27 +; SI-NEXT: v_readlane_b32 s7, v44, 28 ; SI-NEXT: s_or_b32 s22, s23, s22 ; SI-NEXT: s_add_i32 s23, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 26 +; SI-NEXT: v_readlane_b32 s7, v44, 27 ; SI-NEXT: s_and_b32 s23, s23, 0xff ; SI-NEXT: s_lshl_b32 s60, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 25 ; SI-NEXT: s_or_b32 s23, s60, s23 ; SI-NEXT: s_lshl_b32 s60, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 25 +; SI-NEXT: v_readlane_b32 s7, v44, 26 ; SI-NEXT: s_add_i32 s61, s7, 3 ; SI-NEXT: s_and_b32 s61, s61, 0xff ; SI-NEXT: s_lshl_b32 s61, s61, 16 ; SI-NEXT: s_addk_i32 s23, 0x300 ; SI-NEXT: s_or_b32 s60, s60, s61 ; SI-NEXT: s_and_b32 s23, s23, 0xffff -; SI-NEXT: v_readlane_b32 s7, v43, 31 +; SI-NEXT: v_readlane_b32 s7, v44, 32 ; SI-NEXT: s_or_b32 s23, s60, s23 ; SI-NEXT: s_add_i32 s60, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 30 +; SI-NEXT: v_readlane_b32 s7, v44, 31 ; SI-NEXT: s_and_b32 s60, s60, 0xff ; SI-NEXT: s_lshl_b32 s61, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 28 +; SI-NEXT: v_readlane_b32 s7, v44, 29 ; SI-NEXT: s_or_b32 s60, s61, s60 ; SI-NEXT: s_lshl_b32 s61, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 29 +; SI-NEXT: v_readlane_b32 s7, v44, 30 ; SI-NEXT: s_add_i32 s62, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 47 +; SI-NEXT: v_readlane_b32 s7, v43, 48 ; SI-NEXT: s_and_b32 s62, s62, 0xff ; SI-NEXT: s_add_i32 s59, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 46 +; SI-NEXT: v_readlane_b32 s7, v43, 47 ; SI-NEXT: s_lshl_b32 s62, s62, 16 ; SI-NEXT: s_addk_i32 s60, 0x300 ; SI-NEXT: s_and_b32 s59, s59, 0xff ; SI-NEXT: s_lshl_b32 s58, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 32 +; SI-NEXT: v_readlane_b32 s7, v44, 33 ; SI-NEXT: s_or_b32 s61, s61, s62 ; SI-NEXT: s_and_b32 s60, s60, 0xffff ; SI-NEXT: s_or_b32 s58, s58, s59 ; SI-NEXT: s_lshl_b32 s59, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 33 +; SI-NEXT: v_readlane_b32 s7, v44, 44 ; SI-NEXT: s_or_b32 s60, s61, s60 ; SI-NEXT: s_add_i32 s61, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 45 +; SI-NEXT: v_readlane_b32 s7, v43, 46 ; SI-NEXT: s_add_i32 s57, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 44 +; SI-NEXT: v_readlane_b32 s7, v43, 45 ; SI-NEXT: s_lshl_b32 s56, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 43 +; SI-NEXT: v_readlane_b32 s7, v43, 44 ; SI-NEXT: s_lshl_b32 s47, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 42 +; SI-NEXT: v_readlane_b32 s7, v43, 43 ; SI-NEXT: s_add_i32 s46, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 41 +; SI-NEXT: v_readlane_b32 s7, v43, 42 ; SI-NEXT: s_add_i32 s45, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 38 +; SI-NEXT: v_readlane_b32 s7, v43, 39 ; SI-NEXT: s_lshl_b32 s42, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 35 +; SI-NEXT: v_readlane_b32 s7, v43, 36 ; SI-NEXT: s_lshl_b32 s15, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 34 +; 
SI-NEXT: v_readlane_b32 s7, v43, 35 ; SI-NEXT: s_and_b32 s45, s45, 0xff ; SI-NEXT: s_add_i32 s14, s7, 3 ; SI-NEXT: s_or_b32 s42, s42, s45 ; SI-NEXT: s_and_b32 s14, s14, 0xff ; SI-NEXT: s_lshl_b32 s14, s14, 16 ; SI-NEXT: s_addk_i32 s42, 0x300 -; SI-NEXT: v_readlane_b32 s7, v42, 40 +; SI-NEXT: v_readlane_b32 s7, v43, 41 ; SI-NEXT: s_and_b32 s57, s57, 0xff ; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: s_and_b32 s15, s42, 0xffff ; SI-NEXT: s_add_i32 s44, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 39 +; SI-NEXT: v_readlane_b32 s7, v43, 40 ; SI-NEXT: s_or_b32 s56, s56, s57 ; SI-NEXT: s_or_b32 s57, s14, s15 ; SI-NEXT: s_and_b32 s14, s44, 0xff ; SI-NEXT: s_lshl_b32 s15, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 37 +; SI-NEXT: v_readlane_b32 s7, v43, 38 ; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: s_lshl_b32 s15, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 36 +; SI-NEXT: v_readlane_b32 s7, v43, 37 ; SI-NEXT: s_add_i32 s40, s7, 3 ; SI-NEXT: s_and_b32 s61, s61, 0xff ; SI-NEXT: s_and_b32 s40, s40, 0xff @@ -156766,15 +157577,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s58, s59, s58 ; SI-NEXT: s_or_b32 s59, s15, s14 ; SI-NEXT: s_add_i32 s14, s6, 0x3000000 -; SI-NEXT: v_readlane_b32 s6, v42, 31 +; SI-NEXT: v_readlane_b32 s6, v43, 32 ; SI-NEXT: s_add_i32 s11, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 28 +; SI-NEXT: v_readlane_b32 s7, v43, 29 ; SI-NEXT: s_and_b32 s6, s11, 0xff ; SI-NEXT: s_lshl_b32 s8, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 25 +; SI-NEXT: v_readlane_b32 s7, v43, 26 ; SI-NEXT: s_or_b32 s6, s8, s6 ; SI-NEXT: s_lshl_b32 s8, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 24 +; SI-NEXT: v_readlane_b32 s7, v43, 25 ; SI-NEXT: s_add_i32 s24, s7, 3 ; SI-NEXT: s_and_b32 s11, s24, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 @@ -156782,47 +157593,47 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s8, s8, s11 ; SI-NEXT: s_or_b32 s8, s8, s6 -; SI-NEXT: v_readlane_b32 s6, v42, 32 +; SI-NEXT: v_readlane_b32 s6, v43, 33 ; SI-NEXT: s_add_i32 s12, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 26 +; SI-NEXT: v_readlane_b32 s7, v43, 27 ; SI-NEXT: s_and_b32 s6, s12, 0xff ; SI-NEXT: s_lshl_b32 s11, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 16 +; SI-NEXT: v_readlane_b32 s7, v43, 17 ; SI-NEXT: s_or_b32 s6, s11, s6 ; SI-NEXT: s_lshl_b32 s11, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 18 +; SI-NEXT: v_readlane_b32 s7, v43, 19 ; SI-NEXT: s_add_i32 s12, s7, 3 ; SI-NEXT: s_and_b32 s12, s12, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: v_readlane_b32 s7, v42, 33 +; SI-NEXT: v_readlane_b32 s7, v43, 34 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s11, s11, s12 ; SI-NEXT: s_add_i32 s13, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v42, 30 +; SI-NEXT: v_readlane_b32 s7, v43, 31 ; SI-NEXT: s_or_b32 s6, s11, s6 ; SI-NEXT: s_and_b32 s11, s13, 0xff ; SI-NEXT: s_lshl_b32 s10, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 22 +; SI-NEXT: v_readlane_b32 s7, v43, 23 ; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: s_lshl_b32 s11, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 23 +; SI-NEXT: v_readlane_b32 s7, v43, 24 ; SI-NEXT: s_add_i32 s25, s7, 3 ; SI-NEXT: s_and_b32 s12, s25, 0xff ; SI-NEXT: s_addk_i32 s10, 0x300 ; SI-NEXT: s_lshl_b32 s12, s12, 16 ; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: s_or_b32 s11, s11, s12 -; SI-NEXT: v_readlane_b32 s7, v42, 29 +; SI-NEXT: v_readlane_b32 s7, v43, 30 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: s_add_i32 s9, s7, 3 -; 
SI-NEXT: v_readlane_b32 s7, v42, 27 -; SI-NEXT: v_readlane_b32 s11, v42, 20 +; SI-NEXT: v_readlane_b32 s7, v43, 28 +; SI-NEXT: v_readlane_b32 s11, v43, 21 ; SI-NEXT: s_and_b32 s9, s9, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: v_readlane_b32 s9, v42, 21 +; SI-NEXT: v_readlane_b32 s9, v43, 22 ; SI-NEXT: s_and_b32 s11, s11, 0xff ; SI-NEXT: s_addk_i32 s7, 0x300 ; SI-NEXT: s_lshl_b32 s9, s9, 24 @@ -156830,15 +157641,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s7, s7, 0xffff ; SI-NEXT: s_or_b32 s9, s9, s11 ; SI-NEXT: s_or_b32 s7, s9, s7 -; SI-NEXT: v_readlane_b32 s9, v42, 19 +; SI-NEXT: v_readlane_b32 s9, v43, 20 ; SI-NEXT: s_add_i32 s21, s9, 3 -; SI-NEXT: v_readlane_b32 s11, v42, 17 -; SI-NEXT: v_readlane_b32 s12, v42, 14 +; SI-NEXT: v_readlane_b32 s11, v43, 18 +; SI-NEXT: v_readlane_b32 s12, v43, 15 ; SI-NEXT: s_and_b32 s9, s21, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 8 ; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: v_readlane_b32 s11, v42, 15 +; SI-NEXT: v_readlane_b32 s11, v43, 16 ; SI-NEXT: s_and_b32 s12, s12, 0xff ; SI-NEXT: s_addk_i32 s9, 0x300 ; SI-NEXT: s_lshl_b32 s11, s11, 24 @@ -156846,15 +157657,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s9, s9, 0xffff ; SI-NEXT: s_or_b32 s11, s11, s12 ; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: v_readlane_b32 s11, v42, 13 +; SI-NEXT: v_readlane_b32 s11, v43, 14 ; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: v_readlane_b32 s12, v42, 12 -; SI-NEXT: v_readlane_b32 s13, v42, 10 +; SI-NEXT: v_readlane_b32 s12, v43, 13 +; SI-NEXT: v_readlane_b32 s13, v43, 11 ; SI-NEXT: s_and_b32 s11, s11, 0xff ; SI-NEXT: s_lshl_b32 s12, s12, 8 ; SI-NEXT: s_add_i32 s13, s13, 3 ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: v_readlane_b32 s12, v42, 11 +; SI-NEXT: v_readlane_b32 s12, v43, 12 ; SI-NEXT: s_and_b32 s13, s13, 0xff ; SI-NEXT: s_addk_i32 s11, 0x300 ; SI-NEXT: s_lshl_b32 s12, s12, 24 @@ -156862,16 +157673,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s11, s11, 0xffff ; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: v_readlane_b32 s12, v42, 9 +; SI-NEXT: v_readlane_b32 s12, v43, 10 ; SI-NEXT: s_add_i32 s15, s16, 0x3000000 ; SI-NEXT: s_add_i32 s12, s12, 3 -; SI-NEXT: v_readlane_b32 s13, v42, 8 -; SI-NEXT: v_readlane_b32 s16, v42, 6 +; SI-NEXT: v_readlane_b32 s13, v43, 9 +; SI-NEXT: v_readlane_b32 s16, v43, 7 ; SI-NEXT: s_and_b32 s12, s12, 0xff ; SI-NEXT: s_lshl_b32 s13, s13, 8 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_readlane_b32 s13, v42, 7 +; SI-NEXT: v_readlane_b32 s13, v43, 8 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_addk_i32 s12, 0x300 ; SI-NEXT: s_lshl_b32 s13, s13, 24 @@ -156879,16 +157690,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: s_or_b32 s13, s13, s16 ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_readlane_b32 s13, v42, 5 +; SI-NEXT: v_readlane_b32 s13, v43, 6 ; SI-NEXT: s_add_i32 s40, s17, 0x3000000 ; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: v_readlane_b32 s16, v42, 4 -; SI-NEXT: v_readlane_b32 s17, v42, 2 +; SI-NEXT: v_readlane_b32 s16, v43, 5 +; SI-NEXT: v_readlane_b32 s17, v43, 3 ; SI-NEXT: s_and_b32 s13, s13, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 8 ; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 
s13, s16, s13 -; SI-NEXT: v_readlane_b32 s16, v42, 3 +; SI-NEXT: v_readlane_b32 s16, v43, 4 ; SI-NEXT: s_and_b32 s17, s17, 0xff ; SI-NEXT: s_addk_i32 s13, 0x300 ; SI-NEXT: s_lshl_b32 s16, s16, 24 @@ -156896,16 +157707,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s13, s13, 0xffff ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_or_b32 s13, s16, s13 -; SI-NEXT: v_readlane_b32 s16, v42, 1 +; SI-NEXT: v_readlane_b32 s16, v43, 2 ; SI-NEXT: s_add_i32 s41, s18, 0x3000000 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 0 -; SI-NEXT: v_readlane_b32 s18, v43, 62 +; SI-NEXT: v_readlane_b32 s17, v43, 1 +; SI-NEXT: v_readlane_b32 s18, v44, 63 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v43, 63 +; SI-NEXT: v_readlane_b32 s17, v43, 0 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -156914,16 +157725,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s17, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v43, 61 +; SI-NEXT: v_readlane_b32 s16, v44, 62 ; SI-NEXT: s_add_i32 s42, s19, 0x3000000 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s18, v43, 60 -; SI-NEXT: v_readlane_b32 s19, v43, 58 +; SI-NEXT: v_readlane_b32 s18, v44, 61 +; SI-NEXT: v_readlane_b32 s19, v44, 59 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 8 ; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_or_b32 s16, s18, s16 -; SI-NEXT: v_readlane_b32 s18, v43, 59 +; SI-NEXT: v_readlane_b32 s18, v44, 60 ; SI-NEXT: s_and_b32 s19, s19, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s18, s18, 24 @@ -156931,16 +157742,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: s_or_b32 s16, s18, s16 -; SI-NEXT: v_readlane_b32 s18, v43, 57 +; SI-NEXT: v_readlane_b32 s18, v44, 58 ; SI-NEXT: s_add_i32 s43, s20, 0x3000000 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_readlane_b32 s19, v43, 56 -; SI-NEXT: v_readlane_b32 s20, v43, 54 +; SI-NEXT: v_readlane_b32 s19, v44, 57 +; SI-NEXT: v_readlane_b32 s20, v44, 55 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 8 ; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: v_readlane_b32 s19, v43, 55 +; SI-NEXT: v_readlane_b32 s19, v44, 56 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_addk_i32 s18, 0x300 ; SI-NEXT: s_lshl_b32 s19, s19, 24 @@ -156948,15 +157759,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s18, s18, 0xffff ; SI-NEXT: s_or_b32 s19, s19, s20 ; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: v_readlane_b32 s19, v43, 53 +; SI-NEXT: v_readlane_b32 s19, v44, 54 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_readlane_b32 s20, v43, 52 -; SI-NEXT: v_readlane_b32 s21, v43, 50 +; SI-NEXT: v_readlane_b32 s20, v44, 53 +; SI-NEXT: v_readlane_b32 s21, v44, 51 ; SI-NEXT: s_and_b32 s19, s19, 0xff ; SI-NEXT: s_lshl_b32 s20, s20, 8 ; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: v_readlane_b32 s20, v43, 51 +; SI-NEXT: v_readlane_b32 s20, v44, 52 ; SI-NEXT: s_and_b32 s21, s21, 0xff ; SI-NEXT: s_addk_i32 s19, 0x300 ; SI-NEXT: s_lshl_b32 s20, s20, 24 
@@ -156964,16 +157775,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s19, s19, 0xffff ; SI-NEXT: s_or_b32 s20, s20, s21 ; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: v_readlane_b32 s20, v43, 49 +; SI-NEXT: v_readlane_b32 s20, v44, 50 ; SI-NEXT: s_add_i32 s44, s22, 0x3000000 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s21, v43, 48 -; SI-NEXT: v_readlane_b32 s22, v43, 46 +; SI-NEXT: v_readlane_b32 s21, v44, 49 +; SI-NEXT: v_readlane_b32 s22, v44, 47 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s21, s21, 8 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s20, s21, s20 -; SI-NEXT: v_readlane_b32 s21, v43, 47 +; SI-NEXT: v_readlane_b32 s21, v44, 48 ; SI-NEXT: s_and_b32 s22, s22, 0xff ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_lshl_b32 s21, s21, 24 @@ -156982,16 +157793,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s21, s21, s22 ; SI-NEXT: s_or_b32 s20, s21, s20 ; SI-NEXT: s_add_i32 s21, s20, 0x3000000 -; SI-NEXT: v_readlane_b32 s20, v43, 43 +; SI-NEXT: v_readlane_b32 s20, v44, 43 ; SI-NEXT: s_add_i32 s45, s23, 0x3000000 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s22, v43, 42 -; SI-NEXT: v_readlane_b32 s23, v43, 44 +; SI-NEXT: v_readlane_b32 s22, v44, 42 +; SI-NEXT: v_readlane_b32 s23, v44, 45 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s22, s22, 8 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_or_b32 s20, s22, s20 -; SI-NEXT: v_readlane_b32 s22, v43, 45 +; SI-NEXT: v_readlane_b32 s22, v44, 46 ; SI-NEXT: s_and_b32 s23, s23, 0xff ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_lshl_b32 s22, s22, 24 @@ -157000,15 +157811,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s22, s22, s23 ; SI-NEXT: s_or_b32 s20, s22, s20 ; SI-NEXT: s_add_i32 s22, s20, 0x3000000 -; SI-NEXT: v_readlane_b32 s20, v43, 41 +; SI-NEXT: v_readlane_b32 s20, v44, 41 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s23, v43, 40 -; SI-NEXT: v_readlane_b32 s24, v43, 38 +; SI-NEXT: v_readlane_b32 s23, v44, 40 +; SI-NEXT: v_readlane_b32 s24, v44, 38 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s23, s23, 8 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_or_b32 s20, s23, s20 -; SI-NEXT: v_readlane_b32 s23, v43, 39 +; SI-NEXT: v_readlane_b32 s23, v44, 39 ; SI-NEXT: s_and_b32 s24, s24, 0xff ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_lshl_b32 s23, s23, 24 @@ -157017,361 +157828,367 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s23, s23, s24 ; SI-NEXT: s_or_b32 s20, s23, s20 ; SI-NEXT: s_add_i32 s23, s20, 0x3000000 -; SI-NEXT: v_readlane_b32 s20, v43, 37 +; SI-NEXT: v_readlane_b32 s20, v44, 37 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s24, v43, 36 -; SI-NEXT: v_readlane_b32 s25, v43, 34 +; SI-NEXT: v_readlane_b32 s24, v44, 36 +; SI-NEXT: v_readlane_b32 s25, v44, 34 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s24, s24, 8 ; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_or_b32 s20, s24, s20 -; SI-NEXT: v_readlane_b32 s24, v43, 35 +; SI-NEXT: v_readlane_b32 s24, v44, 35 ; SI-NEXT: s_and_b32 s25, s25, 0xff ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_lshl_b32 s24, s24, 24 ; SI-NEXT: s_lshl_b32 s25, s25, 16 ; SI-NEXT: s_and_b32 s20, s20, 0xffff ; SI-NEXT: s_or_b32 s24, s24, s25 -; SI-NEXT: s_and_b32 s46, s46, 0xff ; SI-NEXT: s_or_b32 s20, s24, s20 -; SI-NEXT: v_readlane_b32 s24, v43, 3 -; 
SI-NEXT: s_lshl_b32 s46, s46, 16 -; SI-NEXT: s_addk_i32 s56, 0x300 +; SI-NEXT: v_readlane_b32 s24, v44, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_readlane_b32 s25, v43, 2 -; SI-NEXT: v_readlane_b32 s26, v43, 1 -; SI-NEXT: s_or_b32 s46, s47, s46 -; SI-NEXT: s_and_b32 s47, s56, 0xffff -; SI-NEXT: s_add_i32 s7, s7, 0x3000000 -; SI-NEXT: s_add_i32 s9, s9, 0x3000000 +; SI-NEXT: v_readlane_b32 s25, v44, 2 +; SI-NEXT: v_readlane_b32 s26, v44, 1 ; SI-NEXT: s_and_b32 s24, s24, 0xff ; SI-NEXT: s_lshl_b32 s25, s25, 8 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_or_b32 s56, s46, s47 -; SI-NEXT: s_add_i32 s47, s58, 0x3000000 -; SI-NEXT: s_add_i32 s58, s59, 0x3000000 -; SI-NEXT: s_add_i32 s10, s10, 0x3000000 ; SI-NEXT: s_or_b32 s24, s25, s24 -; SI-NEXT: v_readlane_b32 s25, v43, 0 +; SI-NEXT: v_readlane_b32 s25, v44, 0 ; SI-NEXT: s_and_b32 s26, s26, 0xff -; SI-NEXT: s_and_b32 s73, s9, 0xffff0000 -; SI-NEXT: s_lshl_b32 s59, s9, 16 -; SI-NEXT: s_and_b32 s9, s7, 0xffff0000 -; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_add_i32 s13, s13, 0x3000000 ; SI-NEXT: s_addk_i32 s24, 0x300 ; SI-NEXT: s_lshl_b32 s25, s25, 24 ; SI-NEXT: s_lshl_b32 s26, s26, 16 -; SI-NEXT: s_and_b32 s63, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s79, s17, 16 -; SI-NEXT: v_writelane_b32 v42, s9, 50 -; SI-NEXT: s_lshl_b32 s17, s7, 16 -; SI-NEXT: s_lshl_b32 s7, s10, 16 -; SI-NEXT: s_add_i32 s8, s8, 0x3000000 +; SI-NEXT: s_add_i32 s9, s9, 0x3000000 +; SI-NEXT: s_add_i32 s11, s11, 0x3000000 +; SI-NEXT: s_add_i32 s18, s18, 0x3000000 ; SI-NEXT: s_and_b32 s24, s24, 0xffff ; SI-NEXT: s_or_b32 s25, s25, s26 -; SI-NEXT: v_writelane_b32 v42, s7, 51 -; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 +; SI-NEXT: s_and_b32 s89, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s88, s17, 16 +; SI-NEXT: s_and_b32 s17, s13, 0xffff0000 +; SI-NEXT: s_add_i32 s7, s7, 0x3000000 ; SI-NEXT: s_or_b32 s24, s25, s24 -; SI-NEXT: v_writelane_b32 v42, s7, 52 +; SI-NEXT: s_and_b32 s74, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s25, s18, 16 +; SI-NEXT: v_writelane_b32 v43, s17, 49 +; SI-NEXT: s_and_b32 s63, s11, 0xffff0000 +; SI-NEXT: s_lshl_b32 s18, s11, 16 +; SI-NEXT: s_and_b32 s11, s9, 0xffff0000 +; SI-NEXT: s_and_b32 s46, s46, 0xff +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: v_writelane_b32 v43, s11, 50 +; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: s_and_b32 s9, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: s_addk_i32 s56, 0x300 +; SI-NEXT: s_add_i32 s8, s8, 0x3000000 +; SI-NEXT: v_writelane_b32 v43, s9, 51 +; SI-NEXT: s_lshl_b32 s17, s7, 16 +; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 +; SI-NEXT: s_or_b32 s46, s47, s46 +; SI-NEXT: s_and_b32 s47, s56, 0xffff +; SI-NEXT: v_writelane_b32 v43, s7, 52 ; SI-NEXT: s_and_b32 s7, s8, 0xffff0000 +; SI-NEXT: s_or_b32 s56, s46, s47 +; SI-NEXT: s_add_i32 s47, s58, 0x3000000 +; SI-NEXT: s_add_i32 s58, s59, 0x3000000 +; SI-NEXT: v_writelane_b32 v43, s7, 53 +; SI-NEXT: s_lshl_b32 s7, s8, 16 +; SI-NEXT: s_add_i32 s57, s57, 0x3000000 +; SI-NEXT: v_writelane_b32 v43, s7, 54 +; SI-NEXT: s_and_b32 s7, s58, 0xffff0000 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s46, s60, 0x3000000 ; SI-NEXT: s_add_i32 s56, s56, 0x3000000 -; SI-NEXT: s_add_i32 s57, s57, 0x3000000 -; SI-NEXT: s_add_i32 s11, s11, 0x3000000 +; SI-NEXT: s_add_i32 s10, s10, 0x3000000 ; SI-NEXT: s_add_i32 s12, s12, 0x3000000 -; SI-NEXT: s_add_i32 s13, s13, 0x3000000 ; SI-NEXT: s_add_i32 s16, s16, 0x3000000 -; SI-NEXT: s_add_i32 s18, s18, 0x3000000 ; SI-NEXT: s_add_i32 s19, s19, 0x3000000 ; SI-NEXT: s_add_i32 
s20, s20, 0x3000000 ; SI-NEXT: s_add_i32 s24, s24, 0x3000000 -; SI-NEXT: v_writelane_b32 v42, s7, 53 -; SI-NEXT: s_lshl_b32 s7, s8, 16 +; SI-NEXT: v_writelane_b32 v43, s7, 55 +; SI-NEXT: s_and_b32 s7, s57, 0xffff0000 ; SI-NEXT: s_and_b32 s27, s24, 0xffff0000 ; SI-NEXT: s_lshl_b32 s26, s24, 16 -; SI-NEXT: s_and_b32 s24, s20, 0xffff0000 +; SI-NEXT: s_and_b32 s65, s20, 0xffff0000 ; SI-NEXT: s_lshl_b32 s20, s20, 16 -; SI-NEXT: s_and_b32 s35, s23, 0xffff0000 +; SI-NEXT: s_and_b32 s66, s23, 0xffff0000 ; SI-NEXT: s_lshl_b32 s29, s23, 16 -; SI-NEXT: s_and_b32 s90, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s74, s22, 16 -; SI-NEXT: s_and_b32 s25, s21, 0xffff0000 +; SI-NEXT: s_and_b32 s64, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s95, s22, 16 +; SI-NEXT: s_and_b32 s76, s21, 0xffff0000 ; SI-NEXT: s_lshl_b32 s21, s21, 16 -; SI-NEXT: s_and_b32 s75, s19, 0xffff0000 +; SI-NEXT: s_and_b32 s77, s19, 0xffff0000 ; SI-NEXT: s_lshl_b32 s22, s19, 16 -; SI-NEXT: s_and_b32 s61, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s76, s18, 16 -; SI-NEXT: s_and_b32 s77, s16, 0xffff0000 +; SI-NEXT: s_and_b32 s78, s16, 0xffff0000 ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_and_b32 s89, s13, 0xffff0000 ; SI-NEXT: s_lshl_b32 s19, s13, 16 -; SI-NEXT: s_and_b32 s13, s12, 0xffff0000 -; SI-NEXT: s_lshl_b32 s88, s12, 16 -; SI-NEXT: s_and_b32 s60, s11, 0xffff0000 -; SI-NEXT: s_lshl_b32 s18, s11, 16 -; SI-NEXT: s_and_b32 s23, s10, 0xffff0000 +; SI-NEXT: s_and_b32 s75, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s79, s12, 16 +; SI-NEXT: s_and_b32 s13, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s59, s10, 16 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: v_writelane_b32 v42, s7, 54 -; SI-NEXT: s_and_b32 s72, s58, 0xffff0000 ; SI-NEXT: s_lshl_b32 s99, s58, 16 -; SI-NEXT: s_and_b32 s7, s57, 0xffff0000 +; SI-NEXT: v_writelane_b32 v43, s7, 56 ; SI-NEXT: s_lshl_b32 s57, s57, 16 -; SI-NEXT: s_and_b32 s49, s56, 0xffff0000 +; SI-NEXT: s_and_b32 s7, s56, 0xffff0000 ; SI-NEXT: s_lshl_b32 s8, s56, 16 -; SI-NEXT: s_and_b32 s51, s47, 0xffff0000 -; SI-NEXT: s_lshl_b32 s50, s47, 16 -; SI-NEXT: s_and_b32 s52, s46, 0xffff0000 +; SI-NEXT: s_and_b32 s56, s47, 0xffff0000 +; SI-NEXT: s_lshl_b32 s23, s47, 16 +; SI-NEXT: s_and_b32 s47, s46, 0xffff0000 ; SI-NEXT: s_lshl_b32 s97, s46, 16 -; SI-NEXT: s_and_b32 s54, s45, 0xffff0000 -; SI-NEXT: s_lshl_b32 s53, s45, 16 -; SI-NEXT: s_and_b32 s55, s44, 0xffff0000 +; SI-NEXT: s_and_b32 s24, s45, 0xffff0000 +; SI-NEXT: s_lshl_b32 s45, s45, 16 +; SI-NEXT: s_and_b32 s58, s44, 0xffff0000 ; SI-NEXT: s_lshl_b32 s28, s44, 16 -; SI-NEXT: s_and_b32 s65, s43, 0xffff0000 -; SI-NEXT: s_lshl_b32 s64, s43, 16 -; SI-NEXT: s_and_b32 s66, s42, 0xffff0000 +; SI-NEXT: s_and_b32 s73, s43, 0xffff0000 +; SI-NEXT: s_lshl_b32 s46, s43, 16 +; SI-NEXT: s_and_b32 s67, s42, 0xffff0000 ; SI-NEXT: s_lshl_b32 s87, s42, 16 ; SI-NEXT: s_and_b32 s68, s41, 0xffff0000 -; SI-NEXT: s_lshl_b32 s67, s41, 16 -; SI-NEXT: s_and_b32 s69, s40, 0xffff0000 +; SI-NEXT: s_lshl_b32 s42, s41, 16 +; SI-NEXT: s_and_b32 s70, s40, 0xffff0000 ; SI-NEXT: s_lshl_b32 s86, s40, 16 -; SI-NEXT: s_and_b32 s62, s15, 0xffff0000 -; SI-NEXT: s_lshl_b32 s70, s15, 16 -; SI-NEXT: s_and_b32 s80, s14, 0xffff0000 +; SI-NEXT: s_and_b32 s94, s15, 0xffff0000 +; SI-NEXT: s_lshl_b32 s69, s15, 16 +; SI-NEXT: s_and_b32 s11, s14, 0xffff0000 ; SI-NEXT: s_lshl_b32 s85, s14, 16 -; SI-NEXT: s_and_b32 s92, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s11, s5, 16 -; SI-NEXT: s_and_b32 s83, s4, 0xffff0000 +; SI-NEXT: s_and_b32 s31, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s80, s5, 16 +; SI-NEXT: s_and_b32 s15, s4, 0xffff0000 ; 
SI-NEXT: s_lshl_b32 s84, s4, 16 -; SI-NEXT: v_writelane_b32 v42, s7, 55 +; SI-NEXT: v_writelane_b32 v43, s7, 57 ; SI-NEXT: .LBB89_3: ; %end ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s27 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s26 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s26 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_readlane_b32 s4, v43, 49 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s24 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s20 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s20 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s29 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s29 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s25 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s21 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s21 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s22 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s22 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s74 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s25 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s78 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 
; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s79 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s88 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s88 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s18 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: v_readlane_b32 s4, v43, 50 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s59 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: v_readlane_b32 s4, v42, 50 +; SI-NEXT: v_readlane_b32 s4, v43, 51 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s23 -; SI-NEXT: v_readlane_b32 s4, v42, 51 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s59 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: v_readlane_b32 s4, v42, 52 +; SI-NEXT: v_readlane_b32 s4, v43, 52 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: 
v_mul_f32_e64 v2, 1.0, s6 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: v_readlane_b32 s4, v42, 53 +; SI-NEXT: v_readlane_b32 s4, v43, 53 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_readlane_b32 s4, v42, 54 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_readlane_b32 s4, v43, 54 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: v_readlane_b32 s4, v43, 55 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s72 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s99 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s99 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: v_readlane_b32 s4, v42, 55 +; SI-NEXT: v_readlane_b32 s4, v43, 56 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s57 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s57 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: v_readlane_b32 s4, v43, 57 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s8 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s8 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s50 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s56 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s23 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s97 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s47 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s97 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s53 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s24 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s45 +; SI-NEXT: v_lshr_b64 v[1:2], 
v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s28 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s58 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s28 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s64 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s46 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s87 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s87 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s67 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s42 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s86 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s70 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s86 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s70 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s80 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s85 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s11 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s85 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s11 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s80 +; SI-NEXT: v_lshr_b64 
v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s83 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s84 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s15 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s84 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload @@ -157415,6 +158232,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -157425,99 +158243,109 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; kill: killed $sgpr8 -; SI-NEXT: v_readlane_b32 s58, v43, 19 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: s_mov_b32 s95, s47 -; SI-NEXT: s_mov_b32 s94, s21 -; SI-NEXT: s_mov_b32 s93, s61 -; SI-NEXT: s_mov_b32 s34, s73 -; SI-NEXT: s_mov_b32 s91, s75 -; SI-NEXT: v_readlane_b32 s56, v43, 10 -; SI-NEXT: s_mov_b32 s36, s63 -; SI-NEXT: s_mov_b32 s38, s59 -; SI-NEXT: s_mov_b32 s37, s42 -; SI-NEXT: v_readlane_b32 s30, v43, 17 -; SI-NEXT: v_readlane_b32 s98, v43, 6 -; SI-NEXT: s_mov_b32 s46, s45 -; SI-NEXT: s_mov_b32 s31, s43 -; SI-NEXT: s_mov_b32 s78, s40 -; SI-NEXT: v_readlane_b32 s15, v43, 14 -; SI-NEXT: s_mov_b32 s39, s57 -; SI-NEXT: s_mov_b32 s48, s13 -; SI-NEXT: v_readlane_b32 s41, v43, 13 -; SI-NEXT: v_readlane_b32 s44, v43, 5 -; SI-NEXT: v_readlane_b32 s9, v43, 11 -; SI-NEXT: v_readlane_b32 s14, v43, 12 -; SI-NEXT: v_readlane_b32 s81, v43, 9 -; SI-NEXT: v_readlane_b32 s10, v43, 16 -; SI-NEXT: v_readlane_b32 s12, v43, 4 -; SI-NEXT: v_readlane_b32 s96, v43, 7 -; SI-NEXT: v_readlane_b32 s82, v43, 8 -; SI-NEXT: v_readlane_b32 s71, v43, 15 +; SI-NEXT: ; kill: killed $sgpr8 +; SI-NEXT: v_readlane_b32 s92, v44, 24 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: v_readlane_b32 s91, v44, 20 +; SI-NEXT: s_mov_b32 s90, s88 +; SI-NEXT: v_readlane_b32 s36, v44, 23 +; SI-NEXT: v_readlane_b32 s35, v44, 19 +; SI-NEXT: v_readlane_b32 s62, v44, 22 +; SI-NEXT: v_readlane_b32 s38, v44, 18 +; SI-NEXT: s_mov_b32 s34, s46 +; SI-NEXT: s_mov_b32 s93, s21 +; SI-NEXT: s_mov_b32 s37, s43 +; SI-NEXT: s_mov_b32 s39, s75 +; SI-NEXT: v_readlane_b32 s72, v44, 10 +; SI-NEXT: s_mov_b32 s50, s63 +; SI-NEXT: s_mov_b32 s51, s59 +; SI-NEXT: s_mov_b32 s48, s56 +; SI-NEXT: v_readlane_b32 s30, v44, 21 +; SI-NEXT: s_mov_b32 s49, s61 +; SI-NEXT: s_mov_b32 s52, s79 +; SI-NEXT: v_readlane_b32 s98, v44, 6 +; SI-NEXT: s_mov_b32 s55, s45 +; SI-NEXT: v_readlane_b32 s43, v44, 17 +; SI-NEXT: s_mov_b32 s60, s40 +; SI-NEXT: v_readlane_b32 s41, v44, 14 +; SI-NEXT: s_mov_b32 s53, s42 +; SI-NEXT: s_mov_b32 s54, s13 +; SI-NEXT: v_readlane_b32 s14, v44, 13 +; SI-NEXT: v_readlane_b32 s44, v44, 5 +; SI-NEXT: 
v_readlane_b32 s9, v44, 11 +; SI-NEXT: v_readlane_b32 s81, v44, 12 +; SI-NEXT: v_readlane_b32 s82, v44, 9 +; SI-NEXT: v_readlane_b32 s10, v44, 16 +; SI-NEXT: v_readlane_b32 s12, v44, 4 +; SI-NEXT: v_readlane_b32 s96, v44, 7 +; SI-NEXT: v_readlane_b32 s83, v44, 8 +; SI-NEXT: v_readlane_b32 s71, v44, 15 ; SI-NEXT: ; kill: killed $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; kill: killed $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr27 ; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr65 ; SI-NEXT: ; implicit-def: $sgpr29 -; SI-NEXT: ; implicit-def: $sgpr35 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr64 ; SI-NEXT: ; implicit-def: $sgpr21 -; SI-NEXT: ; implicit-def: $sgpr25 -; SI-NEXT: ; implicit-def: $sgpr22 -; SI-NEXT: ; implicit-def: $sgpr75 ; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr22 ; SI-NEXT: ; implicit-def: $sgpr77 -; SI-NEXT: ; implicit-def: $sgpr79 -; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $sgpr19 -; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr75 ; SI-NEXT: ; implicit-def: $sgpr18 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr61 ; SI-NEXT: ; implicit-def: $sgpr17 ; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr99 -; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $sgpr57 ; SI-NEXT: ; kill: killed $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr49 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; kill: killed $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr97 -; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $sgpr53 -; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr24 ; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr55 -; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr73 ; SI-NEXT: ; implicit-def: $sgpr87 -; SI-NEXT: ; implicit-def: $sgpr66 ; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr68 ; SI-NEXT: ; implicit-def: $sgpr86 -; SI-NEXT: ; implicit-def: $sgpr69 ; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr94 ; SI-NEXT: ; implicit-def: $sgpr85 -; SI-NEXT: ; implicit-def: $sgpr80 ; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; 
implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr31 ; SI-NEXT: ; implicit-def: $sgpr84 -; SI-NEXT: ; implicit-def: $sgpr83 +; SI-NEXT: ; implicit-def: $sgpr15 ; SI-NEXT: s_branch .LBB89_2 ; ; VI-LABEL: bitcast_v128i8_to_v64bf16_scalar: @@ -168901,23 +169729,23 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: v_writelane_b32 v63, s34, 0 ; SI-NEXT: v_writelane_b32 v63, s35, 1 ; SI-NEXT: v_writelane_b32 v63, s36, 2 @@ -168954,1640 +169782,1859 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_writelane_b32 v63, s99, 33 ; SI-NEXT: 
v_writelane_b32 v63, s30, 34
; SI-NEXT: v_writelane_b32 v63, s31, 35
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44
; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:40
; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48
-; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68
; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64
; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76
; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72
-; SI-NEXT: s_waitcnt expcnt(5)
-; SI-NEXT: v_mul_f32_e32 v56, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v9
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v39, 1.0, v10
+; SI-NEXT: v_mul_f32_e32 v35, 1.0, v2
+; SI-NEXT: v_mul_f32_e32 v51, 1.0, v1
+; SI-NEXT: v_mul_f32_e32 v47, 1.0, v4
+; SI-NEXT: v_mul_f32_e32 v46, 1.0, v3
+; SI-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8
+; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7
; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v60, 1.0, v9
+; SI-NEXT: v_mul_f32_e32 v9, 1.0, v12
+; SI-NEXT: v_mul_f32_e32 v31, 1.0, v11
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v14
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v18
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v26
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v46, v21
-; SI-NEXT: v_mul_f32_e32 v47, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v32, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3
-; SI-NEXT: v_mul_f32_e32 v58, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v59, 1.0, v8
-; SI-NEXT: v_mul_f32_e32 v61, 1.0, v7
-; SI-NEXT: v_mul_f32_e32 v5, 1.0, v12
-; SI-NEXT: v_mul_f32_e32 v60, 1.0, v11
-; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; SI-NEXT: v_mul_f32_e32 v21, 1.0, v16
-; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; SI-NEXT: v_mul_f32_e32 v9, 1.0, v20
+; SI-NEXT: v_mul_f32_e32 v7, 1.0, v13
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v15
+; SI-NEXT: v_mul_f32_e32 v55, 1.0, v18
+; SI-NEXT: v_mul_f32_e32 v18, 1.0, v17
+; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20
; SI-NEXT: v_mul_f32_e32 v12, 1.0, v19
-; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; SI-NEXT: v_mul_f32_e32 v20, 1.0, v46
-; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; SI-NEXT: v_mul_f32_e32 v46, 1.0, v23
-; SI-NEXT: v_mul_f32_e32 v26, 1.0, v25
-; SI-NEXT: v_mul_f32_e32 v57, 1.0, v28
-; SI-NEXT: v_mul_f32_e32 v16, 1.0, v27
-; SI-NEXT: v_mul_f32_e32 v28, 1.0, v30
-; SI-NEXT: v_mul_f32_e32 v30, 1.0, v29
-; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v3, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v4, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v14, 1.0, s21
-; SI-NEXT: v_mul_f32_e64 v15, 1.0, s20
-; SI-NEXT: v_mul_f32_e64 v7, 1.0, s23
-; SI-NEXT: v_mul_f32_e64 v6, 1.0, s22
-; SI-NEXT: v_mul_f32_e64 v18, 1.0, s25
-; SI-NEXT: v_mul_f32_e64 v19, 1.0, s24
-; SI-NEXT: v_mul_f32_e64 v10, 1.0, s27
-; SI-NEXT: v_mul_f32_e64 v8, 1.0, s26
-; SI-NEXT: v_mul_f32_e64 v23, 1.0, s29
-; SI-NEXT: v_mul_f32_e64 v25, 1.0, s28
+; SI-NEXT: v_mul_f32_e32 v8, 1.0, v22
+; SI-NEXT: v_mul_f32_e32 v19, 1.0, v21
+; SI-NEXT: v_mul_f32_e32 v21, 1.0, v24
+; SI-NEXT: v_mul_f32_e32 v22, 1.0, v23
+; SI-NEXT: v_mul_f32_e32 v23, 1.0, v26
+; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; SI-NEXT: v_mul_f32_e32 v26, 1.0, v30
+; SI-NEXT: v_mul_f32_e32 v24, 1.0, v29
+; SI-NEXT: v_mul_f32_e64 v13, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v15, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v11, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v30, 1.0, s21
; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
+; SI-NEXT: ; implicit-def: $vgpr61 : SGPR spill to VGPR lane
; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: v_mul_f32_e32 v31, 1.0, v33
-; SI-NEXT: v_mul_f32_e32 v27, 1.0, v34
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
+; SI-NEXT: v_mul_f32_e32 v32, 1.0, v16
+; SI-NEXT: v_mul_f32_e32 v29, 1.0, v33
+; SI-NEXT: v_mul_f32_e32 v17, 1.0, v34
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_mul_f32_e32 v29, 1.0, v35
-; SI-NEXT: v_mul_f32_e32 v36, 1.0, v36
-; SI-NEXT: v_mul_f32_e32 v35, 1.0, v37
-; SI-NEXT: v_mul_f32_e32 v34, 1.0, v38
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; SI-NEXT: v_mul_f32_e32 v37, 1.0, v39
-; SI-NEXT: v_mul_f32_e32 v48, 1.0, v49
+; SI-NEXT: v_mul_f32_e32 v10, 1.0, v38
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52
-; SI-NEXT: v_mul_f32_e32 v39, 1.0, v50
-; SI-NEXT: v_mul_f32_e32 v33, 1.0, v51
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v10, 1.0, v48
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v33, 1.0, v36
+; SI-NEXT: v_mul_f32_e32 v37, 1.0, v37
; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: v_mul_f32_e32 v51, 1.0, v53
-; SI-NEXT: v_mul_f32_e32 v50, 1.0, v54
+; SI-NEXT: v_mul_f32_e32 v57, 1.0, v57
+; SI-NEXT: v_mul_f32_e32 v36, 1.0, v58
; SI-NEXT: s_waitcnt vmcnt(13)
-; SI-NEXT: v_mul_f32_e32 v49, 1.0, v55
-; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_mul_f32_e32 v58, 1.0, v59
+; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v10, 1.0, v54
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v59, 1.0, v49
+; SI-NEXT: s_waitcnt vmcnt(6) expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v10, 1.0, v42
+; SI-NEXT: v_mul_f32_e32 v50, 1.0, v50
+; SI-NEXT: v_mul_f32_e32 v49, 1.0, v52
+; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53
; SI-NEXT: v_mul_f32_e32 v38, 1.0, v40
-; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_mul_f32_e32 v55, 1.0, v42
-; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_mul_f32_e32 v54, 1.0, v43
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_mul_f32_e32 v43, 1.0, v44
-; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_mul_f32_e32 v53, 1.0, v45
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: v_mul_f32_e32 v40, 1.0, v41
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(6) expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v10, 1.0, v43
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_mul_f32_e32 v56, 1.0, v44
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_mul_f32_e32 v14, 1.0, v45
+; SI-NEXT: v_mul_f32_e64 v52, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v34, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v54, 1.0, s23
+; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22
+; SI-NEXT: v_mul_f32_e64 v41, 1.0, s25
+; SI-NEXT: v_mul_f32_e64 v42, 1.0, s24
+; SI-NEXT: v_mul_f32_e64 v45, 1.0, s27
+; SI-NEXT: v_mul_f32_e64 v48, 1.0, s26
+; SI-NEXT: v_mul_f32_e64 v43, 1.0, s29
+; SI-NEXT: v_mul_f32_e64 v44, 1.0, s28
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB91_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v3
-; SI-NEXT: v_mov_b32_e32 v42, v37
-; SI-NEXT: v_alignbit_b32 v37, v2, v11, 16
-; SI-NEXT: v_alignbit_b32 v11, v44, v4, 16
-; SI-NEXT: v_readfirstlane_b32 s4, v37
-; SI-NEXT: v_readfirstlane_b32 s5, v11
-; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14
-; SI-NEXT: v_writelane_b32 v62, s6, 0
-; SI-NEXT: v_alignbit_b32 v2, v2, v15, 16
-; SI-NEXT: v_writelane_b32 v62, s7, 1
-; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16
-; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: v_readfirstlane_b32 s4, v2
-; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v14, v52, v6, 16
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; SI-NEXT: v_readfirstlane_b32 s5, v14
-; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16
-; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 24
-; SI-NEXT: s_lshr_b64 s[12:13], s[4:5], 16
-; SI-NEXT: s_lshr_b64 s[16:17], s[4:5], 8
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: v_readfirstlane_b32 s4, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v19, v2, v8, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23
-; SI-NEXT: v_readfirstlane_b32 s5, v19
-; SI-NEXT: v_alignbit_b32 v2, v2, v25, 16
-; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v56
-; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 24
-; SI-NEXT: s_lshr_b64 s[18:19], s[4:5], 16
-; SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 8
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: v_readfirstlane_b32 s4, v2
-; SI-NEXT: v_alignbit_b32 v47, v45, v47, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v32
-; SI-NEXT: v_readfirstlane_b32 s5, v47
-; SI-NEXT: v_alignbit_b32 v2, v2, v1, 16
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v58
-; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 24
-; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 16
-; SI-NEXT: s_lshr_b64 s[28:29], s[4:5], 8
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: v_readfirstlane_b32 s4, v2
-; SI-NEXT: v_mov_b32_e32 v4, v58
-; SI-NEXT: v_alignbit_b32 v58, v8, v41, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v59
-; SI-NEXT: v_readfirstlane_b32 s5, v58
-; SI-NEXT: v_alignbit_b32 v2, v2, v61, 16
-; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 24
-; SI-NEXT: s_lshr_b64 s[40:41], s[4:5], 16
-; SI-NEXT: s_lshr_b64 s[44:45], s[4:5], 8
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: v_readfirstlane_b32 s4, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: v_alignbit_b32 v2, v2, v60, 16
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v23, v22
-; SI-NEXT: v_mov_b32_e32 v40, v36
-; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v56
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v18
-; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_alignbit_b32 v41, v15, v6, 16
-; SI-NEXT: v_readfirstlane_b32 s5, v41
-; SI-NEXT: s_lshr_b64 s[42:43], s[4:5], 24
-; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 16
-; SI-NEXT: s_lshr_b64 s[58:59], s[4:5], 8
-; SI-NEXT: v_readfirstlane_b32 s4, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v32
-; SI-NEXT: v_alignbit_b32 v59, v1, v13, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; SI-NEXT: v_readfirstlane_b32 s5, v59
-; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5
-; SI-NEXT: s_lshr_b64 s[56:57], s[4:5], 24
-; SI-NEXT: s_lshr_b64 s[60:61], s[4:5], 16
-; SI-NEXT: s_lshr_b64 s[72:73], s[4:5], 8
-; SI-NEXT: v_alignbit_b32 v61, v1, v17, 16
-; SI-NEXT: v_readfirstlane_b32 s5, v61
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v58
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_alignbit_b32 v2, v2, v21, 16
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: v_readfirstlane_b32 s4, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9
-; SI-NEXT: v_alignbit_b32 v2, v2, v12, 16
-; SI-NEXT: s_lshr_b64 s[62:63], s[4:5], 24
-; SI-NEXT: s_lshr_b64 s[74:75], s[4:5], 16
-; SI-NEXT: s_lshr_b64 s[78:79], s[4:5], 8
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: v_readfirstlane_b32 s4, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22
-; SI-NEXT: v_alignbit_b32 v60, v2, v20, 16
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24
-; SI-NEXT: v_alignbit_b32 v1, v2, v46, 16
-; SI-NEXT: v_readfirstlane_b32 s5, v60
-; SI-NEXT: s_lshr_b64 s[76:77], s[4:5], 24
-; SI-NEXT: s_lshr_b64 s[88:89], s[4:5], 16
-; SI-NEXT: s_lshr_b64 s[92:93], s[4:5], 8
+; SI-NEXT: v_readfirstlane_b32 s4, v13
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v15
+; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v52
+; SI-NEXT: s_lshr_b32 s7, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v30
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v34
+; SI-NEXT: s_lshr_b64 s[86:87], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v54
+; SI-NEXT: s_lshr_b32 s65, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v41
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v42
+; SI-NEXT: s_lshr_b64 s[80:81], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v45
+; SI-NEXT: s_lshr_b32 s69, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v43
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v44
+; SI-NEXT: v_mov_b32_e32 v34, v35
+; SI-NEXT: s_lshr_b64 s[66:67], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v34
+; SI-NEXT: s_lshr_b32 s91, s4, 16
+; SI-NEXT: v_mov_b32_e32 v30, v51
+; SI-NEXT: v_readfirstlane_b32 s4, v47
+; SI-NEXT: v_mov_b32_e32 v51, v46
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v51
+; SI-NEXT: v_mov_b32_e32 v35, v6
+; SI-NEXT: s_lshr_b64 s[52:53], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v35
+; SI-NEXT: s_lshr_b32 s37, s4, 16
; SI-NEXT: v_readfirstlane_b32 s4, v1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v1, v5
-; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v23
-; SI-NEXT: v_mov_b32_e32 v5, v28
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v10
-; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v47
-; SI-NEXT: v_lshrrev_b32_e32 v12, 8, v41
-; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v61
-; SI-NEXT: v_lshrrev_b32_e32 v23, 8, v60
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v25, v2, v26, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v57
-; SI-NEXT: v_readfirstlane_b32 s5, v25
-; SI-NEXT: v_alignbit_b32 v2, v2, v16, 16
-; SI-NEXT: s_lshr_b64 s[90:91], s[4:5], 24
-; SI-NEXT: s_lshr_b64 s[94:95], s[4:5], 16
-; SI-NEXT: s_lshr_b64 s[34:35], s[4:5], 8
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: v_readfirstlane_b32 s4, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v28
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v22, v2, v30, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v31
-; SI-NEXT: v_readfirstlane_b32 s5, v22
-; SI-NEXT: v_alignbit_b32 v2, v2, v27, 16
-; SI-NEXT: s_lshr_b64 s[30:31], s[4:5], 24
-; SI-NEXT: s_lshr_b64 s[36:37], s[4:5], 16
-; SI-NEXT: s_lshr_b64 s[38:39], s[4:5], 8
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: v_readfirstlane_b32 s4, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v29
-; SI-NEXT: v_alignbit_b32 v17, v2, v36, 16
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v35
-; SI-NEXT: v_readfirstlane_b32 s5, v17
-; SI-NEXT: v_alignbit_b32 v2, v2, v34, 16
-; SI-NEXT: s_lshr_b64 s[48:49], s[4:5], 24
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_mov_b32_e32 v41, v5
+; SI-NEXT: v_readfirstlane_b32 s4, v5
+; SI-NEXT: v_mov_b32_e32 v5, v39
+; SI-NEXT: s_lshr_b64 s[30:31], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v5
+; SI-NEXT: s_lshr_b32 s89, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v9
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v31
; SI-NEXT: s_lshr_b64 s[50:51], s[4:5], 16
-; SI-NEXT: s_lshr_b64 s[52:53], s[4:5], 8
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: v_readfirstlane_b32 s4, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v42
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v20
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v5
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v22
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v29
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v29, v37
-; SI-NEXT: v_mov_b32_e32 v37, v42
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v37
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v33
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v4
+; SI-NEXT: s_lshr_b32 s57, s4, 16
+; SI-NEXT: v_mov_b32_e32 v42, v32
+; SI-NEXT: v_readfirstlane_b32 s4, v32
+; SI-NEXT: v_mov_b32_e32 v32, v4
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v32
+; SI-NEXT: v_mov_b32_e32 v6, v55
+; SI-NEXT: s_lshr_b64 s[92:93], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v6
+; SI-NEXT: s_lshr_b32 s79, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v20
+; SI-NEXT: v_mov_b32_e32 v39, v12
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v39
+; SI-NEXT: v_mov_b32_e32 v9, v8
+; SI-NEXT: s_waitcnt expcnt(6)
+; SI-NEXT: v_mov_b32_e32 v43, v20
+; SI-NEXT: s_lshr_b64 s[76:77], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v9
+; SI-NEXT: v_mov_b32_e32 v20, v21
+; SI-NEXT: v_readfirstlane_b32 s78, v18
+; SI-NEXT: s_lshr_b32 s73, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v20
+; SI-NEXT: v_mov_b32_e32 v18, v22
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v18
+; SI-NEXT: s_lshr_b64 s[62:63], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v23
+; SI-NEXT: s_lshr_b32 s59, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v28
+; SI-NEXT: v_mov_b32_e32 v21, v25
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v27
+; SI-NEXT: v_mov_b32_e32 v25, v26
+; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v25
+; SI-NEXT: v_mov_b32_e32 v12, v29
+; SI-NEXT: s_lshr_b32 s45, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v12
+; SI-NEXT: s_waitcnt expcnt(5)
+; SI-NEXT: v_mov_b32_e32 v44, v1
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_mov_b32_e32 v1, v52
+; SI-NEXT: s_waitcnt expcnt(4)
+; SI-NEXT: v_mov_b32_e32 v52, v17
+; SI-NEXT: v_readfirstlane_b32 s4, v17
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v29, v33
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v22, v24
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: s_lshr_b64 s[40:41], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v29
+; SI-NEXT: s_lshr_b32 s29, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s6, v11
+; SI-NEXT: v_readfirstlane_b32 s12, v40
+; SI-NEXT: s_lshr_b64 s[96:97], s[6:7], 16
+; SI-NEXT: s_mov_b32 s9, s96
+; SI-NEXT: v_readfirstlane_b32 s88, v60
+; SI-NEXT: s_lshr_b64 s[82:83], s[88:89], 16
+; SI-NEXT: v_readfirstlane_b32 s64, v16
+; SI-NEXT: s_lshr_b64 s[84:85], s[64:65], 16
+; SI-NEXT: s_mov_b32 s87, s84
+; SI-NEXT: v_readfirstlane_b32 s68, v48
+; SI-NEXT: s_lshr_b64 s[70:71], s[68:69], 16
+; SI-NEXT: s_mov_b32 s81, s70
+; SI-NEXT: v_readfirstlane_b32 s90, v30
+; SI-NEXT: s_lshr_b64 s[38:39], s[90:91], 16
+; SI-NEXT: s_mov_b32 s67, s38
+; SI-NEXT: v_readfirstlane_b32 s36, v3
+; SI-NEXT: s_lshr_b64 s[98:99], s[36:37], 16
+; SI-NEXT: s_mov_b32 s53, s98
+; SI-NEXT: s_mov_b32 s31, s82
+; SI-NEXT: v_readfirstlane_b32 s56, v7
+; SI-NEXT: s_lshr_b64 s[94:95], s[56:57], 16
+; SI-NEXT: s_mov_b32 s51, s94
+; SI-NEXT: s_lshr_b64 s[74:75], s[78:79], 16
+; SI-NEXT: s_mov_b32 s93, s74
+; SI-NEXT: v_readfirstlane_b32 s72, v19
+; SI-NEXT: s_lshr_b64 s[60:61], s[72:73], 16
+; SI-NEXT: s_mov_b32 s77, s60
+; SI-NEXT: v_readfirstlane_b32 s58, v21
+; SI-NEXT: s_lshr_b64 s[54:55], s[58:59], 16
+; SI-NEXT: s_mov_b32 s63, s54
+; SI-NEXT: v_readfirstlane_b32 s44, v22
+; SI-NEXT: s_lshr_b64 s[42:43], s[44:45], 16
+; SI-NEXT: s_mov_b32 s47, s42
+; SI-NEXT: v_mov_b32_e32 v26, v37
+; SI-NEXT: v_readfirstlane_b32 s28, v26
+; SI-NEXT: s_lshr_b64 s[26:27], s[28:29], 16
+; SI-NEXT: s_mov_b32 s41, s26
+; SI-NEXT: v_readfirstlane_b32 s22, v36
+; SI-NEXT: v_readfirstlane_b32 s18, v49
+; SI-NEXT: v_lshrrev_b32_e32 v48, 24, v1
+; SI-NEXT: v_mov_b32_e32 v1, v56
+; SI-NEXT: v_mov_b32_e32 v3, v54
+; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v6
+; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v50
+; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v38
+; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v1
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_mov_b32_e32 v54, v59
+; SI-NEXT: s_lshr_b32 s78, s96, 8
+; SI-NEXT: s_lshr_b32 s61, s84, 8
+; SI-NEXT: s_lshr_b32 s72, s70, 8
+; SI-NEXT: s_lshr_b32 s75, s38, 8
+; SI-NEXT: s_lshr_b32 s58, s98, 8
+; SI-NEXT: s_lshr_b32 s43, s82, 8
+; SI-NEXT: s_lshr_b32 s44, s94, 8
+; SI-NEXT: s_mov_b32 s64, s74
+; SI-NEXT: s_lshr_b32 s27, s74, 8
+; SI-NEXT: s_mov_b32 s90, s60
+; SI-NEXT: s_lshr_b32 s28, s60, 8
+; SI-NEXT: s_lshr_b32 s74, s54, 8
+; SI-NEXT: s_mov_b32 s68, s42
+; SI-NEXT: s_mov_b32 s56, s26
; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v3
-; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v11
-; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v7
-; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v14
-; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v19
-; SI-NEXT: v_lshrrev_b32_e32 v4, 8, v59
-; SI-NEXT: v_lshrrev_b32_e32 v35, 24, v43
-; SI-NEXT: v_mov_b32_e32 v31, v20
-; SI-NEXT: v_mov_b32_e32 v20, v34
-; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_alignbit_b32 v30, v2, v36, 16
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v48
-; SI-NEXT: v_alignbit_b32 v2, v2, v39, 16
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; SI-NEXT: v_readfirstlane_b32 s5, v30
-; SI-NEXT: s_lshr_b64 s[54:55], s[4:5], 24
-; SI-NEXT: s_lshr_b64 s[64:65], s[4:5], 16
-; SI-NEXT: s_lshr_b64 s[68:69], s[4:5], 8
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: v_readfirstlane_b32 s4, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v28, v36
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_alignbit_b32 v57, v2, v39, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51
-; SI-NEXT: v_readfirstlane_b32 s5, v57
-; SI-NEXT: v_alignbit_b32 v2, v2, v50, 16
-; SI-NEXT: s_lshr_b64 s[66:67], s[4:5], 24
-; SI-NEXT: s_lshr_b64 s[70:71], s[4:5], 16
-; SI-NEXT: s_lshr_b64 s[82:83], s[4:5], 8
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: v_readfirstlane_b32 s4, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v49
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v46, v2, v38, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55
-; SI-NEXT: v_readfirstlane_b32 s5, v46
-; SI-NEXT: v_alignbit_b32 v2, v2, v54, 16
-; SI-NEXT: s_lshr_b64 s[80:81], s[4:5], 24
-; SI-NEXT: s_lshr_b64 s[84:85], s[4:5], 16
-; SI-NEXT: s_lshr_b64 s[96:97], s[4:5], 8
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: v_readfirstlane_b32 s4, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v43
-; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v57
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v38, v2, v53, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 24, v18
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v49
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, v32
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v46
-; SI-NEXT: v_readfirstlane_b32 s5, v38
-; SI-NEXT: v_lshrrev_b32_e32 v2, 24, v2
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38
-; SI-NEXT: s_lshr_b64 s[86:87], s[4:5], 24
-; SI-NEXT: s_lshr_b64 s[98:99], s[4:5], 16
-; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 8
-; SI-NEXT: v_mov_b32_e32 v32, v8
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT: v_lshrrev_b32_e32 v18, 8, v25
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v30
-; SI-NEXT: v_mov_b32_e32 v55, v49
-; SI-NEXT: v_mov_b32_e32 v49, v15
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v8, v6
+; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v45
+; SI-NEXT: v_lshrrev_b32_e32 v13, 24, v34
+; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v35
+; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v5
+; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v2
+; SI-NEXT: v_lshrrev_b32_e32 v55, 24, v9
+; SI-NEXT: v_lshrrev_b32_e32 v4, 24, v25
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_readfirstlane_b32 s4, v17
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_readfirstlane_b32 s4, v33
+; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v57
+; SI-NEXT: s_lshr_b32 s23, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v58
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v59
+; SI-NEXT: s_lshr_b64 s[16:17], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v50
+; SI-NEXT: s_lshr_b32 s19, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v53
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_readfirstlane_b32 s4, v24
+; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s4, v38
+; SI-NEXT: s_lshr_b32 s13, s4, 16
+; SI-NEXT: s_mov_b32 s5, s13
+; SI-NEXT: v_writelane_b32 v61, s4, 26
+; SI-NEXT: v_writelane_b32 v61, s5, 27
+; SI-NEXT: v_readfirstlane_b32 s4, v46
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v10
+; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
+; SI-NEXT: v_readfirstlane_b32 s5, v56
+; SI-NEXT: s_lshr_b64 s[20:21], s[12:13], 16
+; SI-NEXT: s_lshr_b32 s13, s5, 16
+; SI-NEXT: v_readfirstlane_b32 s12, v14
+; SI-NEXT: s_lshr_b64 vcc, s[12:13], 16
+; SI-NEXT: s_mov_b32 s5, vcc_lo
+; SI-NEXT: s_mov_b32 s88, vcc_lo
+; SI-NEXT: s_lshr_b32 s6, vcc_lo, 8
+; SI-NEXT: s_lshr_b64 vcc, s[8:9], 24
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 4
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 5
+; SI-NEXT: s_lshr_b64 vcc, s[8:9], 16
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 2
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 3
+; SI-NEXT: s_lshr_b64 vcc, s[8:9], 8
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 0
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 1
+; SI-NEXT: s_lshr_b64 vcc, s[86:87], 24
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 10
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 11
+; SI-NEXT: s_lshr_b64 vcc, s[86:87], 16
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 8
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 9
+; SI-NEXT: s_lshr_b64 vcc, s[86:87], 8
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 6
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 7
+; SI-NEXT: s_lshr_b64 vcc, s[80:81], 24
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 16
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 17
+; SI-NEXT: s_lshr_b64 vcc, s[80:81], 16
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 14
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 15
+; SI-NEXT: s_lshr_b64 vcc, s[80:81], 8
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 12
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 13
+; SI-NEXT: s_lshr_b64 vcc, s[66:67], 24
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 22
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 23
+; SI-NEXT: s_lshr_b64 vcc, s[66:67], 16
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 20
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 21
+; SI-NEXT: s_lshr_b64 vcc, s[66:67], 8
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 18
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 19
+; SI-NEXT: s_lshr_b64 vcc, s[52:53], 24
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 28
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 29
+; SI-NEXT: s_lshr_b64 vcc, s[52:53], 16
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 26
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 27
+; SI-NEXT: s_lshr_b64 vcc, s[52:53], 8
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 24
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 25
+; SI-NEXT: s_lshr_b64 vcc, s[30:31], 24
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 34
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 35
+; SI-NEXT: s_lshr_b64 vcc, s[30:31], 16
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 32
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 33
+; SI-NEXT: s_lshr_b64 vcc, s[30:31], 8
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 30
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 31
+; SI-NEXT: s_lshr_b64 vcc, s[50:51], 24
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 40
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 41
+; SI-NEXT: s_lshr_b64 vcc, s[50:51], 16
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 38
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 39
+; SI-NEXT: s_lshr_b64 vcc, s[50:51], 8
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 36
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 37
+; SI-NEXT: s_lshr_b64 vcc, s[92:93], 24
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 46
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 47
+; SI-NEXT: s_lshr_b64 vcc, s[92:93], 16
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 44
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 45
+; SI-NEXT: s_lshr_b64 vcc, s[92:93], 8
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 42
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 43
+; SI-NEXT: s_lshr_b64 vcc, s[76:77], 24
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 52
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 53
+; SI-NEXT: s_lshr_b64 vcc, s[76:77], 16
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 50
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 51
+; SI-NEXT: s_lshr_b64 vcc, s[76:77], 8
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 48
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 49
+; SI-NEXT: s_lshr_b64 vcc, s[62:63], 24
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 58
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 59
+; SI-NEXT: s_lshr_b64 vcc, s[62:63], 16
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 56
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 57
+; SI-NEXT: s_lshr_b64 vcc, s[62:63], 8
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 54
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 55
+; SI-NEXT: s_lshr_b64 vcc, s[46:47], 24
+; SI-NEXT: v_writelane_b32 v61, vcc_lo, 0
+; SI-NEXT: v_writelane_b32 v61, vcc_hi, 1
+; SI-NEXT: s_lshr_b64 vcc, s[46:47], 16
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 62
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 63
+; SI-NEXT: s_lshr_b64 vcc, s[46:47], 8
+; SI-NEXT: v_writelane_b32 v62, vcc_lo, 60
+; SI-NEXT: v_writelane_b32 v62, vcc_hi, 61
+; SI-NEXT: s_lshr_b64 vcc, s[40:41], 24
+; SI-NEXT: v_writelane_b32 v61, vcc_lo, 6
+; SI-NEXT: v_writelane_b32 v61, vcc_hi, 7
+; SI-NEXT: s_lshr_b64 vcc, s[40:41], 16
+; SI-NEXT: v_writelane_b32 v61, vcc_lo, 4
+; SI-NEXT: s_lshr_b64 s[34:35], s[22:23], 16
+; SI-NEXT: v_writelane_b32 v61, vcc_hi, 5
+; SI-NEXT: s_lshr_b64 vcc, s[40:41], 8
+; SI-NEXT: s_mov_b32 s25, s34
+; SI-NEXT: v_writelane_b32 v61, vcc_lo, 2
+; SI-NEXT: v_writelane_b32 v61, vcc_hi, 3
+; SI-NEXT: s_lshr_b64 vcc, s[24:25], 24
+; SI-NEXT: v_writelane_b32 v61, vcc_lo, 12
+; SI-NEXT: v_writelane_b32 v61, vcc_hi, 13
+; SI-NEXT: s_lshr_b64 vcc, s[24:25], 16
+; SI-NEXT: v_writelane_b32 v61, vcc_lo, 10
+; SI-NEXT: s_lshr_b64 s[14:15], s[18:19], 16
+; SI-NEXT: v_writelane_b32 v61, vcc_hi, 11
+; SI-NEXT: s_lshr_b64 vcc, s[24:25], 8
+; SI-NEXT: s_mov_b32 s17, s14
+; SI-NEXT: v_writelane_b32 v61, vcc_lo, 8
+; SI-NEXT: v_writelane_b32 v61, vcc_hi, 9
+; SI-NEXT: s_lshr_b64 vcc, s[16:17], 24
+; SI-NEXT: v_writelane_b32 v61, vcc_lo, 18
+; SI-NEXT: v_writelane_b32 v61, vcc_hi, 19
+; SI-NEXT: s_lshr_b64 vcc, s[16:17], 16
+; SI-NEXT: v_writelane_b32 v61, vcc_lo, 16
+; SI-NEXT: v_writelane_b32 v61, vcc_hi, 17
+; SI-NEXT: s_lshr_b64 vcc, s[16:17], 8
+; SI-NEXT: s_mov_b32 s11, s20
+; SI-NEXT: v_writelane_b32 v61, vcc_lo, 14
+; SI-NEXT: v_writelane_b32 v61, vcc_hi, 15
+; SI-NEXT: s_lshr_b64 vcc, s[10:11], 24
+; SI-NEXT: v_writelane_b32 v61, vcc_lo, 24
+; SI-NEXT: v_writelane_b32 v61, vcc_hi, 25
+; SI-NEXT: s_lshr_b64 vcc, s[10:11], 16
+; SI-NEXT: v_writelane_b32 v61, vcc_lo, 22
+; SI-NEXT: v_writelane_b32 v61, vcc_hi, 23
+; SI-NEXT: s_lshr_b64 vcc, s[10:11], 8
+; SI-NEXT: v_writelane_b32 v61, vcc_lo, 20
+; SI-NEXT: v_writelane_b32 v61, vcc_hi, 21
+; SI-NEXT: s_lshr_b64 vcc, s[4:5], 24
+; SI-NEXT: v_writelane_b32 v61, vcc_lo, 32
+; SI-NEXT: v_writelane_b32 v61, vcc_hi, 33
+; SI-NEXT: s_lshr_b64 vcc, s[4:5], 16
+; SI-NEXT: v_writelane_b32 v61, vcc_lo, 30
+; SI-NEXT: v_writelane_b32 v61, vcc_hi, 31
+; SI-NEXT: s_lshr_b64 vcc, s[4:5], 8
+; SI-NEXT: v_writelane_b32 v61, vcc_lo, 28
+; SI-NEXT: s_waitcnt expcnt(5)
+; SI-NEXT: v_lshrrev_b32_e32 v10, 24, v23
+; SI-NEXT: s_lshr_b32 s22, s42, 8
+; SI-NEXT: s_lshr_b32 s21, s26, 8
+; SI-NEXT: s_lshr_b32 s18, s34, 8
+; SI-NEXT: s_mov_b32 s36, s14
+; SI-NEXT: s_lshr_b32 s15, s14, 8
+; SI-NEXT: s_mov_b32 s14, s20
+; SI-NEXT: s_lshr_b32 s12, s20, 8
+; SI-NEXT: v_writelane_b32 v61, vcc_hi, 29
+; SI-NEXT: s_mov_b64 vcc, 0
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v29
+; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v57
+; SI-NEXT: v_mov_b32_e32 v59, v30
+; SI-NEXT: v_mov_b32_e32 v31, v51
+; SI-NEXT: v_mov_b32_e32 v60, v34
+; SI-NEXT: v_mov_b32_e32 v30, v39
+; SI-NEXT: v_mov_b32_e32 v19, v5
+; SI-NEXT: v_mov_b32_e32 v39, v21
+; SI-NEXT: v_mov_b32_e32 v21, v20
+; SI-NEXT: v_mov_b32_e32 v34, v18
+; SI-NEXT: v_mov_b32_e32 v18, v37
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_mov_b32_e32 v7, v26
+; SI-NEXT: v_mov_b32_e32 v20, v2
+; SI-NEXT: v_mov_b32_e32 v37, v17
+; SI-NEXT: v_mov_b32_e32 v51, v33
+; SI-NEXT: v_mov_b32_e32 v17, v9
+; SI-NEXT: v_mov_b32_e32 v9, v10
+; SI-NEXT: v_mov_b32_e32 v26, v25
; SI-NEXT: s_branch .LBB91_3
; SI-NEXT: .LBB91_2:
-; SI-NEXT: ; implicit-def: $vgpr8
-; SI-NEXT: ; kill: killed $vgpr8
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: s_waitcnt expcnt(5)
-; SI-NEXT: v_mov_b32_e32 v55, v49
-; SI-NEXT: ; implicit-def: $vgpr8
-; SI-NEXT: ; kill: killed $vgpr8
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_mov_b32_e32 v54, v59
; SI-NEXT: v_writelane_b32 v62, s4, 0
-; SI-NEXT: ; implicit-def: $vgpr8
-; SI-NEXT: ; kill: killed $vgpr8
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: v_mov_b32_e32 v40, v36
-; SI-NEXT: ; implicit-def: $vgpr8
-; SI-NEXT: ; kill: killed $vgpr8
+; SI-NEXT: v_writelane_b32 v62, s5, 1
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_mov_b32_e32 v59, v51
+; SI-NEXT: v_writelane_b32 v62, s4, 2
+; SI-NEXT: v_writelane_b32 v62, s5, 3
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_mov_b32_e32 v31, v46
+; SI-NEXT: v_writelane_b32 v62, s4, 4
+; SI-NEXT: v_writelane_b32 v62, s5, 5
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_mov_b32_e32 v34, v22
+; SI-NEXT: v_writelane_b32 v62, s4, 6
+; SI-NEXT: v_writelane_b32 v62, s5, 7
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_mov_b32_e32 v22, v24
+; SI-NEXT: v_writelane_b32 v62, s4, 8
+; SI-NEXT: v_writelane_b32 v62, s5, 9
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_mov_b32_e32 v7, v37
+; SI-NEXT: v_writelane_b32 v62, s4, 10
+; SI-NEXT: v_writelane_b32 v62, s5, 11
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: v_writelane_b32 v62, s4, 12
+; SI-NEXT: v_writelane_b32 v62, s5, 13
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: v_writelane_b32 v62, s4, 14
+; SI-NEXT: v_writelane_b32 v62, s5, 15
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: ; implicit-def: $sgpr21
+; SI-NEXT: ; implicit-def: $sgpr20
+; SI-NEXT: ; implicit-def: $sgpr89
+; SI-NEXT: ; implicit-def: $sgpr88
+; SI-NEXT: v_mov_b32_e32 v44, v1
+; SI-NEXT: v_writelane_b32 v62, s4, 16
+; SI-NEXT: v_writelane_b32 v62, s5, 17
+; SI-NEXT: ; implicit-def: $sgpr4
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: v_mov_b32_e32 v52, v17
+; SI-NEXT: v_writelane_b32 v62, s4, 18
+; SI-NEXT: v_writelane_b32 v62, s5, 19
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_mov_b32_e32 v43, v20
+; SI-NEXT: v_writelane_b32 v62, s4, 20
+; SI-NEXT: v_writelane_b32 v62, s5, 21
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_mov_b32_e32 v42, v32
+; SI-NEXT: v_writelane_b32 v62, s4, 22
+; SI-NEXT: v_writelane_b32 v62, s5, 23
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_mov_b32_e32 v41, v5
+; SI-NEXT: v_writelane_b32 v62, s4, 24
+; SI-NEXT: v_writelane_b32 v62, s5, 25
+; SI-NEXT: ; implicit-def: $sgpr4
; SI-NEXT: s_mov_b64 vcc, -1
-; SI-NEXT: ; implicit-def: $vgpr8
-; SI-NEXT: ; kill: killed $vgpr8
-; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: v_writelane_b32 v62, s4, 26
+; SI-NEXT: v_writelane_b32 v62, s5, 27
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_mov_b32_e32 v60, v35
+; SI-NEXT: v_writelane_b32 v62, s4, 28
+; SI-NEXT: v_writelane_b32 v62, s5, 29
+; SI-NEXT: ; implicit-def: $sgpr4
; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: v_writelane_b32 v62, s5, 1
-; SI-NEXT: ; implicit-def: $vgpr8
-; SI-NEXT: ; kill: killed $vgpr8
+; SI-NEXT: v_mov_b32_e32 v35, v6
+; SI-NEXT: v_writelane_b32 v62, s4, 30
+; SI-NEXT: v_writelane_b32 v62, s5, 31
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_mov_b32_e32 v32, v4
+; SI-NEXT: v_writelane_b32 v62, s4, 32
+; SI-NEXT: v_writelane_b32 v62, s5, 33
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_mov_b32_e32 v30, v12
+; SI-NEXT: v_writelane_b32 v62, s4, 34
+; SI-NEXT: v_writelane_b32 v62, s5, 35
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_mov_b32_e32 v19, v39
+; SI-NEXT: v_writelane_b32 v62, s4, 36
+; SI-NEXT: v_writelane_b32 v62, s5, 37
+; SI-NEXT: ; implicit-def: $sgpr4
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr29
-; SI-NEXT: ; implicit-def: $sgpr10
-; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: v_mov_b32_e32 v39, v25
+; SI-NEXT: v_writelane_b32 v62, s4, 38
+; SI-NEXT: v_writelane_b32 v62, s5, 39
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_mov_b32_e32 v12, v29
+; SI-NEXT: v_writelane_b32 v62, s4, 40
+; SI-NEXT: v_writelane_b32 v62, s5, 41
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_mov_b32_e32 v20, v2
+; SI-NEXT: v_writelane_b32 v62, s4, 42
+; SI-NEXT: v_writelane_b32 v62, s5, 43
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_mov_b32_e32 v6, v55
+; SI-NEXT: v_writelane_b32 v62, s4, 44
+; SI-NEXT: v_writelane_b32 v62, s5, 45
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_mov_b32_e32 v17, v8
+; SI-NEXT: v_writelane_b32 v62, s4, 46
+; SI-NEXT: v_writelane_b32 v62, s5, 47
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_mov_b32_e32 v29, v33
+; SI-NEXT: v_writelane_b32 v62, s4, 48
+; SI-NEXT: v_writelane_b32 v62, s5, 49
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: ; implicit-def: $sgpr8
+; SI-NEXT: ; implicit-def: $sgpr96
+; SI-NEXT: ; implicit-def: $sgpr78
+; SI-NEXT: ; implicit-def: $sgpr7
+; SI-NEXT: ; implicit-def: $vgpr48
+; SI-NEXT: ; implicit-def: $sgpr86
+; SI-NEXT: ; implicit-def: $sgpr84
+; SI-NEXT: ; implicit-def: $sgpr61
+; SI-NEXT: ; implicit-def: $sgpr65
+; SI-NEXT: ; implicit-def: $vgpr16
+; SI-NEXT: ; implicit-def: $sgpr80
+; SI-NEXT: ; implicit-def: $sgpr70
+; SI-NEXT: ; implicit-def: $sgpr72
+; SI-NEXT: ; implicit-def: $sgpr69
+; SI-NEXT: ; implicit-def: $vgpr15
+; SI-NEXT: ; implicit-def: $sgpr66
+; SI-NEXT: ; implicit-def: $sgpr38
+; SI-NEXT: ; implicit-def: $sgpr75
+; SI-NEXT: ; implicit-def: $sgpr91
+; SI-NEXT: ; implicit-def: $vgpr13
+; SI-NEXT: ; implicit-def: $sgpr52
+; SI-NEXT: ; implicit-def: $sgpr98
+; SI-NEXT: ; implicit-def: $sgpr58
+; SI-NEXT: ; implicit-def: $sgpr37
; SI-NEXT: ; implicit-def: $vgpr11
+; SI-NEXT: ; implicit-def: $sgpr30
+; SI-NEXT: ; implicit-def: $sgpr82
+; SI-NEXT: ; implicit-def: $sgpr43
+; SI-NEXT: ; implicit-def: $vgpr47
+; SI-NEXT: ; implicit-def: $sgpr44
; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr44
-; SI-NEXT: ; implicit-def: $vgpr16
-; SI-NEXT: ; implicit-def: $sgpr16
-; SI-NEXT: ; implicit-def: $sgpr12
-; SI-NEXT: ; implicit-def: $sgpr8
-; SI-NEXT: ; implicit-def: $vgpr14
-; SI-NEXT: ; implicit-def: $vgpr7
-; SI-NEXT: ; implicit-def: $vgpr52
-; SI-NEXT: ; implicit-def: $vgpr26
-; SI-NEXT: ; implicit-def: $sgpr22
-; SI-NEXT: ; implicit-def: $sgpr18
-; SI-NEXT: ; implicit-def: $sgpr14
-; SI-NEXT: ; implicit-def: $vgpr19
-; SI-NEXT: ; implicit-def: $vgpr27
-; SI-NEXT: ; implicit-def: $vgpr24
+; SI-NEXT: ; implicit-def: $sgpr27
+; SI-NEXT: ; implicit-def: $vgpr18
; SI-NEXT: ; implicit-def: $sgpr28
-; SI-NEXT: ; implicit-def: $sgpr24
-; SI-NEXT: ; implicit-def: $sgpr20
-; SI-NEXT: ; implicit-def: $vgpr47
+; SI-NEXT: ; implicit-def: $sgpr74
; SI-NEXT: ; implicit-def: $vgpr9
-; SI-NEXT: ; implicit-def: $vgpr45
+; SI-NEXT: ; implicit-def: $sgpr22
+; SI-NEXT: ; implicit-def: $vgpr14
+; SI-NEXT: ; implicit-def: $sgpr18
; SI-NEXT: ; implicit-def: $vgpr56
-; SI-NEXT: ; implicit-def: $sgpr44
-; SI-NEXT: ; implicit-def: $sgpr40
-; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: ; implicit-def: $vgpr58
-; SI-NEXT: ; implicit-def: $vgpr13
-; SI-NEXT: ; implicit-def: $vgpr32
-; SI-NEXT: ; implicit-def: $vgpr20
-; SI-NEXT: ; implicit-def: $sgpr58
-; SI-NEXT: ; implicit-def: $sgpr46
-; SI-NEXT: ; implicit-def: $sgpr42
-; SI-NEXT: ; implicit-def: $vgpr41
-; SI-NEXT: ; implicit-def: $vgpr12
-; SI-NEXT: ; implicit-def: $vgpr49
-; SI-NEXT: ; implicit-def: $sgpr72
-; SI-NEXT: ; implicit-def: $sgpr60
-; SI-NEXT: ; implicit-def: $sgpr56
-; SI-NEXT: ; implicit-def: $vgpr59
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; implicit-def: $sgpr78
-; SI-NEXT: ; implicit-def: $sgpr74
-; SI-NEXT: ; implicit-def: $sgpr62
-; SI-NEXT: ; implicit-def: $vgpr61
-; SI-NEXT: ; implicit-def: $vgpr10
+; SI-NEXT: ; implicit-def: $sgpr15
+; SI-NEXT: ; implicit-def: $sgpr12
+; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; implicit-def: $sgpr50
+; SI-NEXT: ; implicit-def: $sgpr94
+; SI-NEXT: ; implicit-def: $sgpr57
; SI-NEXT: ; implicit-def: $sgpr92
-; SI-NEXT: ; implicit-def: $sgpr88
+; SI-NEXT: ; implicit-def: $sgpr64
+; SI-NEXT: ; implicit-def: $sgpr79
; SI-NEXT: ; implicit-def: $sgpr76
-; SI-NEXT: ; implicit-def: $vgpr60
-; SI-NEXT: ; implicit-def: $vgpr23
-; SI-NEXT: ; implicit-def: $sgpr34
-; SI-NEXT: ; implicit-def: $sgpr94
; SI-NEXT: ; implicit-def: $sgpr90
-; SI-NEXT: ; implicit-def: $vgpr25
-; SI-NEXT: ; implicit-def: $vgpr18
-; SI-NEXT: ; implicit-def: $sgpr38
-; SI-NEXT: ; implicit-def: $sgpr36
-; SI-NEXT: ; implicit-def: $sgpr30
-; SI-NEXT: ; implicit-def: $vgpr22
-; SI-NEXT: ; implicit-def: $sgpr52
-; SI-NEXT: ; implicit-def: $sgpr50
-; SI-NEXT: ; implicit-def: $sgpr48
-; SI-NEXT: ; implicit-def: $vgpr17
-; SI-NEXT: ; implicit-def: $sgpr68
-; SI-NEXT: ; implicit-def: $sgpr64
+; SI-NEXT: ; implicit-def: $sgpr73
+; SI-NEXT: ; implicit-def: $sgpr62
; SI-NEXT: ; implicit-def: $sgpr54
-; SI-NEXT: ; implicit-def: $sgpr82
-; SI-NEXT: ; implicit-def: $sgpr70
-; SI-NEXT: ; implicit-def: $sgpr66
-; SI-NEXT: ; implicit-def: $sgpr96
-; SI-NEXT: ; implicit-def: $sgpr84
-; SI-NEXT: ; implicit-def: $sgpr80
-; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: ; implicit-def: $sgpr98
-; SI-NEXT: ; implicit-def: $sgpr86
-; SI-NEXT: ; implicit-def: $vgpr30
-; SI-NEXT: ; implicit-def: $vgpr57
-; SI-NEXT: ; implicit-def: $vgpr46
-; SI-NEXT: ; implicit-def: $vgpr38
-; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: ; kill: killed $vgpr36
-; SI-NEXT: ; implicit-def: $vgpr35
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $sgpr59
+; SI-NEXT: ; implicit-def: $sgpr46
+; SI-NEXT: ; implicit-def: $sgpr68
+; SI-NEXT: ; implicit-def: $sgpr45
+; SI-NEXT: ; implicit-def: $sgpr40
+; SI-NEXT: ; implicit-def: $sgpr56
+; SI-NEXT: ; implicit-def: $sgpr29
+; SI-NEXT: ; implicit-def: $sgpr24
+; SI-NEXT: ; implicit-def: $sgpr34
+; SI-NEXT: ; implicit-def: $sgpr23
+; SI-NEXT: ; implicit-def: $sgpr16
+; SI-NEXT: ; implicit-def: $sgpr36
+; SI-NEXT: ; implicit-def: $sgpr19
+; SI-NEXT: ; implicit-def: $sgpr10
+; SI-NEXT: ; implicit-def: $sgpr14
+; SI-NEXT: ; implicit-def: $sgpr13
+; SI-NEXT: ; implicit-def: $vgpr4
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; kill: killed $vgpr1
; SI-NEXT: ; implicit-def: $vgpr55
+; SI-NEXT: v_writelane_b32 v62, s4, 50
+; SI-NEXT: v_writelane_b32 v62, s5, 51
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v62, s4, 52
+; SI-NEXT: v_writelane_b32 v62, s5, 53
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v62, s4, 54
+; SI-NEXT: v_writelane_b32 v62, s5, 55
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v62, s4, 56
+; SI-NEXT: v_writelane_b32 v62, s5, 57
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v62, s4, 58
+; SI-NEXT: v_writelane_b32 v62, s5, 59
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v62, s4, 60
+; SI-NEXT: v_writelane_b32 v62, s5, 61
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v62, s4, 62
+; SI-NEXT: v_writelane_b32 v62, s5, 63
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v61, s4, 0
+; SI-NEXT: v_writelane_b32 v61, s5, 1
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v61, s4, 2
+; SI-NEXT: v_writelane_b32 v61, s5, 3
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v61, s4, 4
+; SI-NEXT: v_writelane_b32 v61, s5, 5
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v61, s4, 6
+; SI-NEXT: v_writelane_b32 v61, s5, 7
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v61, s4, 8
+; SI-NEXT: v_writelane_b32 v61, s5, 9
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v61, s4, 10
+; SI-NEXT: v_writelane_b32 v61, s5, 11
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v61, s4, 12
+; SI-NEXT: v_writelane_b32 v61, s5, 13
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v61, s4, 14
+; SI-NEXT: v_writelane_b32 v61, s5, 15
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v61, s4, 16
+; SI-NEXT: v_writelane_b32 v61, s5, 17
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v61, s4, 18
+; SI-NEXT: v_writelane_b32 v61, s5, 19
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v61, s4, 20
+; SI-NEXT: v_writelane_b32 v61, s5, 21
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v61, s4, 22
+; SI-NEXT: v_writelane_b32 v61, s5, 23
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v61, s4, 24
+; SI-NEXT: v_writelane_b32 v61, s5, 25
+; SI-NEXT: ; implicit-def: $sgpr5
+; SI-NEXT: v_writelane_b32 v61, s4, 26
+; SI-NEXT: v_writelane_b32 v61, s5, 27
+; SI-NEXT: v_writelane_b32 v61, s20, 28
+; SI-NEXT: v_writelane_b32 v61, s21, 29
+; SI-NEXT: ; implicit-def: $sgpr20
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v61, s20, 30
+; SI-NEXT: v_writelane_b32 v61, s21, 31
+; SI-NEXT: v_writelane_b32 v61, s88, 32
+; SI-NEXT: v_writelane_b32 v61, s89, 33
+; SI-NEXT: ; implicit-def: $sgpr88
; SI-NEXT: .LBB91_3: ; %Flow
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: s_andn2_b64 vcc, exec, vcc
; SI-NEXT: s_cbranch_vccnz .LBB91_5
; SI-NEXT: ; %bb.4: ; %cmp.true
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46
+; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_alignbit_b32 v7, v3, v2, 16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: v_readfirstlane_b32 s4, v7
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v37
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_alignbit_b32 v10, v6, v4, 16
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v33
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: v_readfirstlane_b32 s52, v10
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v51
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_readfirstlane_b32 s4, v11
+; SI-NEXT: s_lshr_b32 s5, s4, 16
+; SI-NEXT: v_readfirstlane_b32 s4, v3
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_readfirstlane_b32 s12, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v53
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v9, v3, v2, 16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: v_readfirstlane_b32 s86, v9
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v50
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38
+; SI-NEXT: v_readfirstlane_b32 s6, v9
+; SI-NEXT: v_readfirstlane_b32 s8, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: s_lshr_b32 s9, s6, 16
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v12, v3, v2, 16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: v_readfirstlane_b32 s80, v12
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_readfirstlane_b32 s6, v8
+; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], 16
+; SI-NEXT: s_lshr_b32 s9, s6, 16
+; SI-NEXT: v_readfirstlane_b32 s8, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v54
+; SI-NEXT: s_mov_b32 s7, s9
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v58
+; SI-NEXT: v_writelane_b32 v61, s6, 26
+; SI-NEXT: s_lshr_b64 s[20:21], s[8:9], 16
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_readfirstlane_b32 s8, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v49
+; SI-NEXT: v_writelane_b32 v61, s7, 27
+; SI-NEXT: v_readfirstlane_b32 s6, v5
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50
+; SI-NEXT: s_lshr_b32 s9, s6, 16
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v13, v3, v2, 16
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: v_readfirstlane_b32 s66, v13
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v38, v4, v3, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v15
+; SI-NEXT: v_readfirstlane_b32 s18, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v37
+; SI-NEXT: s_lshr_b64 s[16:17], s[8:9], 16
+; SI-NEXT: v_readfirstlane_b32 s6, v2
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: s_lshr_b32 s19, s6, 16
+; SI-NEXT: v_readfirstlane_b32 s6, v3
+; SI-NEXT: s_lshr_b32 s9, s6, 16
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v21
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v43
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44
+; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v35
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: s_lshr_b64 s[26:27], s[18:19], 16
+; SI-NEXT: s_mov_b32 s17, s26
+; SI-NEXT: s_mov_b32 s11, s20
+; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v51
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_readfirstlane_b32 s8, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v36
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1
+; SI-NEXT: v_readfirstlane_b32 s22, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v57
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_alignbit_b32 v15, v7, v6, 16
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v42
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v46, v6, v3, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v48
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v39
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: s_lshr_b64 s[24:25], s[8:9], 16
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_readfirstlane_b32 s8, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; SI-NEXT: v_readfirstlane_b32 s6, v1
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_alignbit_b32 v23, v7, v6, 16
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v53
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: v_alignbit_b32 v57, v6, v3, 16
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v54
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v28
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-;
SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_lshr_b32 s23, s6, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v5 +; SI-NEXT: v_readfirstlane_b32 s28, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v27 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v16, v7, v6, 16 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v30, v6, v3, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v21 +; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 16 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v28 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 +; SI-NEXT: v_readfirstlane_b32 s6, v5 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 -; SI-NEXT: v_alignbit_b32 v18, v9, v7, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v20, v10, v9, 16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s38, v15 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s90, v16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s30, v23 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s76, v18 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s62, v20 -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s5, v38 -; SI-NEXT: v_readfirstlane_b32 s87, v46 -; SI-NEXT: v_readfirstlane_b32 s81, v57 -; SI-NEXT: v_readfirstlane_b32 s67, v30 -; SI-NEXT: s_lshr_b64 s[54:55], s[66:67], 24 -; SI-NEXT: s_lshr_b64 s[64:65], s[66:67], 16 -; SI-NEXT: s_lshr_b64 s[68:69], s[66:67], 8 -; SI-NEXT: s_lshr_b64 s[66:67], s[80:81], 24 -; SI-NEXT: s_lshr_b64 s[70:71], s[80:81], 16 -; SI-NEXT: s_lshr_b64 s[82:83], s[80:81], 8 -; SI-NEXT: s_lshr_b64 s[80:81], s[86:87], 24 -; SI-NEXT: s_lshr_b64 s[84:85], s[86:87], 16 -; SI-NEXT: s_lshr_b64 s[96:97], s[86:87], 8 -; SI-NEXT: s_lshr_b64 s[86:87], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[98:99], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 8 -; SI-NEXT: v_lshrrev_b32_e32 v35, 24, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v30 -; SI-NEXT: 
buffer_load_dword v18, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v17, v7, v3, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s53, v17 -; SI-NEXT: s_lshr_b64 s[48:49], s[52:53], 24 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v21, v12, v10, 16 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_readfirstlane_b32 s56, v21 -; SI-NEXT: s_lshr_b64 s[50:51], s[52:53], 16 -; SI-NEXT: s_lshr_b64 s[52:53], s[52:53], 8 -; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v6 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: s_lshr_b32 s29, s6, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v7 +; SI-NEXT: v_readfirstlane_b32 s44, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v34 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v39 +; SI-NEXT: v_readfirstlane_b32 s6, v7 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_alignbit_b32 v22, v9, v3, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_alignbit_b32 v23, v13, v12, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v31 +; 
SI-NEXT: s_lshr_b32 s45, s6, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v9 +; SI-NEXT: v_readfirstlane_b32 s58, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: s_lshr_b64 s[62:63], s[8:9], 16 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s42, v23 -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v57 -; SI-NEXT: v_readfirstlane_b32 s39, v22 +; SI-NEXT: v_readfirstlane_b32 s6, v9 +; SI-NEXT: s_lshr_b32 s59, s6, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v10 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[8:9], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v42 +; SI-NEXT: v_readfirstlane_b32 s6, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: s_lshr_b32 s73, s6, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v11 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v18 +; SI-NEXT: s_lshr_b32 s79, s6, 16 +; SI-NEXT: s_lshr_b64 s[54:55], s[58:59], 16 +; SI-NEXT: s_mov_b32 s63, s54 +; SI-NEXT: s_lshr_b64 s[60:61], s[44:45], 16 +; SI-NEXT: s_mov_b32 s47, s60 +; SI-NEXT: s_lshr_b64 s[42:43], s[28:29], 16 +; SI-NEXT: s_mov_b32 s41, s42 +; SI-NEXT: s_lshr_b64 s[34:35], s[22:23], 16 +; SI-NEXT: s_mov_b32 s25, s34 +; SI-NEXT: v_readfirstlane_b32 s5, v14 +; SI-NEXT: s_lshr_b32 s13, s5, 16 +; SI-NEXT: s_lshr_b64 vcc, s[12:13], 16 +; SI-NEXT: s_mov_b32 s5, vcc_lo +; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v8 +; SI-NEXT: s_lshr_b32 s22, s60, 8 +; SI-NEXT: s_lshr_b32 s21, s42, 8 +; SI-NEXT: s_lshr_b32 s18, s34, 8 +; SI-NEXT: s_lshr_b32 s12, s20, 8 +; SI-NEXT: v_lshrrev_b32_e32 v18, 24, v18 +; SI-NEXT: v_lshrrev_b32_e32 v55, 24, v10 ; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v9 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v46 -; SI-NEXT: s_lshr_b64 s[36:37], s[38:39], 16 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; SI-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v21 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_alignbit_b32 v24, v15, v13, 16 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v25, v10, v3, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s31, v25 -; SI-NEXT: v_readfirstlane_b32 s26, v24 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_lshr_b64 s[94:95], s[30:31], 16 -; SI-NEXT: s_lshr_b64 s[34:35], s[30:31], 8 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_readfirstlane_b32 s72, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_readfirstlane_b32 s20, v26 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v21 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v27, v18, v16, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v60, v12, v3, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s91, v60 -; SI-NEXT: v_readfirstlane_b32 s14, v27 -; SI-NEXT: v_lshrrev_b32_e32 v10, 24, v10 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_lshr_b64 s[88:89], s[90:91], 16 -; SI-NEXT: s_lshr_b64 s[92:93], s[90:91], 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_alignbit_b32 v29, 
v20, v18, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s8, v29 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_lshr_b64 s[92:93], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[72:73], 16 +; SI-NEXT: s_mov_b32 s77, s74 +; SI-NEXT: s_lshr_b32 s28, s74, 8 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_alignbit_b32 v61, v11, v3, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: v_readfirstlane_b32 s77, v61 -; SI-NEXT: v_lshrrev_b32_e32 v12, 24, v12 -; SI-NEXT: s_lshr_b64 s[74:75], s[76:77], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[76:77], 8 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v61 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v13, 24, v13 -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v18 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_readfirstlane_b32 s78, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_lshr_b64 s[48:49], s[78:79], 16 +; SI-NEXT: s_mov_b32 s93, s48 +; SI-NEXT: s_lshr_b32 s27, s48, 8 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v59, v36, v3, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s63, v59 -; SI-NEXT: s_lshr_b64 s[60:61], s[62:63], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[62:63], 8 -; SI-NEXT: v_lshrrev_b32_e32 v4, 8, v59 -; SI-NEXT: v_lshrrev_b32_e32 v18, 8, v25 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v20 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_readfirstlane_b32 s6, v11 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: s_lshr_b64 s[50:51], s[8:9], 16 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 
0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v3 +; SI-NEXT: v_readfirstlane_b32 s56, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_readfirstlane_b32 s8, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 -; SI-NEXT: v_alignbit_b32 v41, v49, v15, 16 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s57, v41 +; SI-NEXT: v_readfirstlane_b32 s6, v3 +; SI-NEXT: s_lshr_b32 s57, s6, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v13 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v19 +; SI-NEXT: s_lshr_b64 s[30:31], s[8:9], 16 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s6, v24 +; SI-NEXT: s_lshr_b32 s89, s6, 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[56:57], 16 +; SI-NEXT: s_mov_b32 s51, s94 +; SI-NEXT: s_lshr_b32 s44, s94, 8 +; SI-NEXT: s_mov_b32 s56, s42 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v24 ; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v3 -; SI-NEXT: s_lshr_b64 s[46:47], s[56:57], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[56:57], 8 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v12, 8, v41 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_readfirstlane_b32 s88, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_readfirstlane_b32 s8, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_lshr_b64 s[82:83], s[88:89], 16 +; SI-NEXT: s_mov_b32 s31, s82 +; SI-NEXT: s_lshr_b32 s43, s82, 8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_readfirstlane_b32 s6, v13 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: s_lshr_b64 s[52:53], s[8:9], 16 +; SI-NEXT: v_readfirstlane_b32 s6, v4 +; SI-NEXT: s_lshr_b32 s37, s6, 16 +; SI-NEXT: s_mov_b32 s88, vcc_lo ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_alignbit_b32 v58, v32, v16, 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s43, v58 -; SI-NEXT: s_lshr_b64 s[40:41], s[42:43], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[42:43], 8 -; SI-NEXT: v_lshrrev_b32_e32 v20, 24, v15 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v58 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_readfirstlane_b32 s36, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_readfirstlane_b32 s8, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v59 +; SI-NEXT: s_lshr_b64 
s[98:99], s[36:37], 16 +; SI-NEXT: s_mov_b32 s53, s98 +; SI-NEXT: s_lshr_b32 s58, s98, 8 +; SI-NEXT: s_mov_b32 s36, s26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_readfirstlane_b32 s6, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v11 +; SI-NEXT: v_readfirstlane_b32 s90, v13 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v60 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: s_lshr_b64 s[66:67], s[8:9], 16 +; SI-NEXT: v_readfirstlane_b32 s6, v11 +; SI-NEXT: s_lshr_b32 s91, s6, 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[90:91], 16 +; SI-NEXT: s_mov_b32 s67, s38 +; SI-NEXT: s_lshr_b32 s75, s38, 8 +; SI-NEXT: s_mov_b32 s90, s74 +; SI-NEXT: s_lshr_b32 s74, s54, 8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_readfirstlane_b32 s8, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v47, v45, v16, 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s27, v47 -; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v47 -; SI-NEXT: s_lshr_b64 s[24:25], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[28:29], s[26:27], 8 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_readfirstlane_b32 s6, v15 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: s_lshr_b64 s[80:81], s[8:9], 16 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v13 +; SI-NEXT: v_readfirstlane_b32 s68, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_readfirstlane_b32 s6, v13 +; SI-NEXT: s_lshr_b32 s69, s6, 16 +; SI-NEXT: s_lshr_b64 s[70:71], s[68:69], 16 +; SI-NEXT: s_mov_b32 s81, s70 +; SI-NEXT: s_lshr_b32 s72, s70, 8 +; SI-NEXT: s_mov_b32 s68, s60 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_readfirstlane_b32 s8, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_readfirstlane_b32 s64, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v19, v11, v16, 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 
s21, v19 -; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v19 -; SI-NEXT: s_lshr_b64 s[18:19], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[22:23], s[20:21], 8 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s6, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: s_lshr_b64 s[86:87], s[8:9], 16 +; SI-NEXT: v_readfirstlane_b32 s6, v12 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: s_lshr_b32 s65, s6, 16 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: s_lshr_b64 s[84:85], s[64:65], 16 +; SI-NEXT: s_mov_b32 s87, s84 +; SI-NEXT: v_lshrrev_b32_e32 v48, 24, v6 +; SI-NEXT: s_lshr_b32 s61, s84, 8 +; SI-NEXT: s_mov_b32 s64, s48 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_readfirstlane_b32 s8, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v14, v52, v16, 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s15, v14 -; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v14 -; SI-NEXT: s_lshr_b64 s[12:13], s[14:15], 16 -; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 8 +; SI-NEXT: v_readfirstlane_b32 s6, v16 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v6 +; SI-NEXT: s_lshr_b32 s7, s6, 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v7 +; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v11, v44, v16, 16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 8, v60 -; SI-NEXT: v_readfirstlane_b32 s9, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v11 -; SI-NEXT: s_lshr_b64 s[6:7], s[8:9], 24 -; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], 8 -; SI-NEXT: v_writelane_b32 v62, s6, 0 -; SI-NEXT: v_writelane_b32 v62, s7, 1 -; SI-NEXT: s_lshr_b64 s[6:7], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[8:9], s[14:15], 24 -; SI-NEXT: s_lshr_b64 s[14:15], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[20:21], s[26:27], 24 -; SI-NEXT: s_lshr_b64 s[26:27], s[42:43], 24 -; SI-NEXT: s_lshr_b64 s[42:43], s[56:57], 24 -; SI-NEXT: s_lshr_b64 s[56:57], s[62:63], 24 -; SI-NEXT: s_lshr_b64 s[62:63], s[76:77], 24 -; SI-NEXT: s_lshr_b64 s[76:77], s[90:91], 24 -; SI-NEXT: s_lshr_b64 s[90:91], s[30:31], 24 -; SI-NEXT: s_lshr_b64 s[30:31], s[38:39], 24 -; SI-NEXT: s_lshr_b64 s[38:39], s[38:39], 8 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_readfirstlane_b32 s6, v15 +; SI-NEXT: s_lshr_b64 s[96:97], s[6:7], 16 +; SI-NEXT: s_mov_b32 s9, s96 +; SI-NEXT: s_lshr_b64 s[14:15], s[8:9], 24 +; SI-NEXT: v_writelane_b32 v62, s14, 4 +; SI-NEXT: v_writelane_b32 v62, s15, 5 +; SI-NEXT: s_lshr_b64 s[14:15], s[8:9], 16 +; SI-NEXT: v_writelane_b32 v62, s14, 2 +; SI-NEXT: v_writelane_b32 v62, s15, 3 +; SI-NEXT: s_lshr_b64 s[14:15], s[8:9], 8 +; SI-NEXT: v_writelane_b32 v62, s14, 0 +; SI-NEXT: v_writelane_b32 v62, s15, 1 +; SI-NEXT: s_lshr_b64 s[14:15], s[86:87], 24 +; SI-NEXT: v_writelane_b32 v62, s14, 10 +; SI-NEXT: v_writelane_b32 v62, s15, 11 +; SI-NEXT: s_lshr_b64 
s[14:15], s[86:87], 16 +; SI-NEXT: v_writelane_b32 v62, s14, 8 +; SI-NEXT: v_writelane_b32 v62, s15, 9 +; SI-NEXT: s_lshr_b64 s[14:15], s[86:87], 8 +; SI-NEXT: v_writelane_b32 v62, s14, 6 +; SI-NEXT: v_writelane_b32 v62, s15, 7 +; SI-NEXT: s_lshr_b64 s[14:15], s[80:81], 24 +; SI-NEXT: v_writelane_b32 v62, s14, 16 +; SI-NEXT: v_writelane_b32 v62, s15, 17 +; SI-NEXT: s_lshr_b64 s[14:15], s[80:81], 16 +; SI-NEXT: v_writelane_b32 v62, s14, 14 +; SI-NEXT: v_writelane_b32 v62, s15, 15 +; SI-NEXT: s_lshr_b64 s[14:15], s[80:81], 8 +; SI-NEXT: v_writelane_b32 v62, s14, 12 +; SI-NEXT: v_writelane_b32 v62, s15, 13 +; SI-NEXT: s_lshr_b64 s[14:15], s[66:67], 24 +; SI-NEXT: v_writelane_b32 v62, s14, 22 +; SI-NEXT: v_writelane_b32 v62, s15, 23 +; SI-NEXT: s_lshr_b64 s[14:15], s[66:67], 16 +; SI-NEXT: v_writelane_b32 v62, s14, 20 +; SI-NEXT: v_writelane_b32 v62, s15, 21 +; SI-NEXT: s_lshr_b64 s[14:15], s[66:67], 8 +; SI-NEXT: v_writelane_b32 v62, s14, 18 +; SI-NEXT: v_writelane_b32 v62, s15, 19 +; SI-NEXT: s_lshr_b64 s[14:15], s[52:53], 24 +; SI-NEXT: v_writelane_b32 v62, s14, 28 +; SI-NEXT: v_writelane_b32 v62, s15, 29 +; SI-NEXT: s_lshr_b64 s[14:15], s[52:53], 16 +; SI-NEXT: v_writelane_b32 v62, s14, 26 +; SI-NEXT: v_writelane_b32 v62, s15, 27 +; SI-NEXT: s_lshr_b64 s[14:15], s[52:53], 8 +; SI-NEXT: v_writelane_b32 v62, s14, 24 +; SI-NEXT: v_writelane_b32 v62, s15, 25 +; SI-NEXT: s_lshr_b64 s[14:15], s[30:31], 24 +; SI-NEXT: v_writelane_b32 v62, s14, 34 +; SI-NEXT: v_writelane_b32 v62, s15, 35 +; SI-NEXT: s_lshr_b64 s[14:15], s[30:31], 16 +; SI-NEXT: v_writelane_b32 v62, s14, 32 +; SI-NEXT: v_writelane_b32 v62, s15, 33 +; SI-NEXT: s_lshr_b64 s[14:15], s[30:31], 8 +; SI-NEXT: v_writelane_b32 v62, s14, 30 +; SI-NEXT: v_writelane_b32 v62, s15, 31 +; SI-NEXT: s_lshr_b64 s[14:15], s[50:51], 24 +; SI-NEXT: v_writelane_b32 v62, s14, 40 +; SI-NEXT: v_writelane_b32 v62, s15, 41 +; SI-NEXT: s_lshr_b64 s[14:15], s[50:51], 16 +; SI-NEXT: v_writelane_b32 v62, s14, 38 +; SI-NEXT: v_writelane_b32 v62, s15, 39 +; SI-NEXT: s_lshr_b64 s[14:15], s[50:51], 8 +; SI-NEXT: v_writelane_b32 v62, s14, 36 +; SI-NEXT: v_writelane_b32 v62, s15, 37 +; SI-NEXT: s_lshr_b64 s[14:15], s[92:93], 24 +; SI-NEXT: v_writelane_b32 v62, s14, 46 +; SI-NEXT: v_writelane_b32 v62, s15, 47 +; SI-NEXT: s_lshr_b64 s[14:15], s[92:93], 16 +; SI-NEXT: v_writelane_b32 v62, s14, 44 +; SI-NEXT: v_writelane_b32 v62, s15, 45 +; SI-NEXT: s_lshr_b64 s[14:15], s[92:93], 8 +; SI-NEXT: v_writelane_b32 v62, s14, 42 +; SI-NEXT: v_writelane_b32 v62, s15, 43 +; SI-NEXT: s_lshr_b64 s[14:15], s[76:77], 24 +; SI-NEXT: v_writelane_b32 v62, s14, 52 +; SI-NEXT: v_writelane_b32 v62, s15, 53 +; SI-NEXT: s_lshr_b64 s[14:15], s[76:77], 16 +; SI-NEXT: v_writelane_b32 v62, s14, 50 +; SI-NEXT: v_writelane_b32 v62, s15, 51 +; SI-NEXT: s_lshr_b64 s[14:15], s[76:77], 8 +; SI-NEXT: v_writelane_b32 v62, s14, 48 +; SI-NEXT: v_writelane_b32 v62, s15, 49 +; SI-NEXT: s_lshr_b64 s[14:15], s[62:63], 24 +; SI-NEXT: v_writelane_b32 v62, s14, 58 +; SI-NEXT: v_writelane_b32 v62, s15, 59 +; SI-NEXT: s_lshr_b64 s[14:15], s[62:63], 16 +; SI-NEXT: v_writelane_b32 v62, s14, 56 +; SI-NEXT: v_writelane_b32 v62, s15, 57 +; SI-NEXT: s_lshr_b64 s[14:15], s[62:63], 8 +; SI-NEXT: v_writelane_b32 v62, s14, 54 +; SI-NEXT: v_writelane_b32 v62, s15, 55 +; SI-NEXT: s_lshr_b64 s[14:15], s[46:47], 24 +; SI-NEXT: v_writelane_b32 v61, s14, 0 +; SI-NEXT: v_writelane_b32 v61, s15, 1 +; SI-NEXT: s_lshr_b64 s[14:15], s[46:47], 16 +; SI-NEXT: v_writelane_b32 v62, s14, 62 +; SI-NEXT: v_writelane_b32 v62, s15, 63 +; 
SI-NEXT: s_lshr_b64 s[14:15], s[46:47], 8 +; SI-NEXT: v_writelane_b32 v62, s14, 60 +; SI-NEXT: v_writelane_b32 v62, s15, 61 +; SI-NEXT: s_lshr_b64 s[14:15], s[40:41], 24 +; SI-NEXT: v_writelane_b32 v61, s14, 6 +; SI-NEXT: v_writelane_b32 v61, s15, 7 +; SI-NEXT: s_lshr_b64 s[14:15], s[40:41], 16 +; SI-NEXT: v_writelane_b32 v61, s14, 4 +; SI-NEXT: v_writelane_b32 v61, s15, 5 +; SI-NEXT: s_lshr_b64 s[14:15], s[40:41], 8 +; SI-NEXT: v_writelane_b32 v61, s14, 2 +; SI-NEXT: v_writelane_b32 v61, s15, 3 +; SI-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 +; SI-NEXT: v_writelane_b32 v61, s14, 12 +; SI-NEXT: v_writelane_b32 v61, s15, 13 +; SI-NEXT: s_lshr_b64 s[14:15], s[24:25], 16 +; SI-NEXT: v_writelane_b32 v61, s14, 10 +; SI-NEXT: v_writelane_b32 v61, s15, 11 +; SI-NEXT: s_lshr_b64 s[14:15], s[24:25], 8 +; SI-NEXT: v_writelane_b32 v61, s14, 8 +; SI-NEXT: v_writelane_b32 v61, s15, 9 +; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 24 +; SI-NEXT: v_writelane_b32 v61, s14, 18 +; SI-NEXT: v_writelane_b32 v61, s15, 19 +; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16 +; SI-NEXT: v_writelane_b32 v61, s14, 16 +; SI-NEXT: v_writelane_b32 v61, s15, 17 +; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 8 +; SI-NEXT: v_writelane_b32 v61, s14, 14 +; SI-NEXT: v_writelane_b32 v61, s15, 15 +; SI-NEXT: s_lshr_b64 s[14:15], s[10:11], 24 +; SI-NEXT: v_writelane_b32 v61, s14, 24 +; SI-NEXT: v_writelane_b32 v61, s15, 25 +; SI-NEXT: s_lshr_b64 s[14:15], s[10:11], 16 +; SI-NEXT: v_writelane_b32 v61, s14, 22 +; SI-NEXT: v_writelane_b32 v61, s15, 23 +; SI-NEXT: s_lshr_b64 s[14:15], s[10:11], 8 +; SI-NEXT: v_writelane_b32 v61, s14, 20 +; SI-NEXT: v_writelane_b32 v61, s15, 21 +; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 24 +; SI-NEXT: v_writelane_b32 v61, s14, 32 +; SI-NEXT: v_writelane_b32 v61, s15, 33 +; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 16 +; SI-NEXT: v_writelane_b32 v61, s14, 30 +; SI-NEXT: v_writelane_b32 v61, s15, 31 +; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 8 +; SI-NEXT: v_writelane_b32 v61, s14, 28 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 24, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 24, v5 +; SI-NEXT: v_writelane_b32 v61, s15, 29 +; SI-NEXT: s_lshr_b32 s78, s96, 8 +; SI-NEXT: s_lshr_b32 s15, s26, 8 +; SI-NEXT: s_mov_b32 s14, s20 +; SI-NEXT: s_lshr_b32 s6, vcc_lo, 8 +; SI-NEXT: v_mov_b32_e32 v14, v4 +; SI-NEXT: v_mov_b32_e32 v4, v6 ; SI-NEXT: .LBB91_5: ; %end -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 -; SI-NEXT: s_lshl_b32 s5, s10, 8 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s6, 0xff -; SI-NEXT: v_readlane_b32 s6, v62, 0 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s6, 24 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: s_and_b32 s5, s8, 0xff +; SI-NEXT: v_readlane_b32 s8, v62, 0 +; SI-NEXT: v_readlane_b32 s9, v62, 1 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_readlane_b32 s8, v62, 2 +; SI-NEXT: v_readlane_b32 s9, v62, 3 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 vcc_lo, v62, 4 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, vcc_lo, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_and_b32 s5, s96, 0xff +; SI-NEXT: s_lshl_b32 s8, s78, 8 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_and_b32 s8, s7, 0xff ; SI-NEXT: 
buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v44 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v16 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_lshl_b32 s5, s16, 8 -; SI-NEXT: s_lshl_b32 s6, s8, 24 -; SI-NEXT: v_add_i32_e32 v5, vcc, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v26 -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: s_lshl_b32 s4, s4, 8 -; SI-NEXT: v_readlane_b32 s7, v62, 1 -; SI-NEXT: v_readlane_b32 s99, v63, 33 -; SI-NEXT: v_readlane_b32 s97, v63, 31 -; SI-NEXT: v_readlane_b32 s87, v63, 29 -; SI-NEXT: v_readlane_b32 s85, v63, 27 -; SI-NEXT: v_readlane_b32 s83, v63, 25 -; SI-NEXT: v_readlane_b32 s81, v63, 23 -; SI-NEXT: v_readlane_b32 s71, v63, 21 -; SI-NEXT: v_readlane_b32 s69, v63, 19 -; SI-NEXT: v_readlane_b32 s67, v63, 17 -; SI-NEXT: v_readlane_b32 s65, v63, 15 -; SI-NEXT: v_readlane_b32 s55, v63, 13 -; SI-NEXT: v_readlane_b32 s53, v63, 11 -; SI-NEXT: v_readlane_b32 s51, v63, 9 -; SI-NEXT: v_readlane_b32 s49, v63, 7 -; SI-NEXT: v_readlane_b32 s39, v63, 5 -; SI-NEXT: v_readlane_b32 s37, v63, 3 -; SI-NEXT: v_readlane_b32 s35, v63, 1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v48 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s8, v1 +; SI-NEXT: v_readlane_b32 s8, v62, 6 +; SI-NEXT: v_readlane_b32 vcc_hi, v62, 5 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s12, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s5, s86, 0xff +; SI-NEXT: v_readlane_b32 s9, v62, 7 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_readlane_b32 s8, v62, 8 +; SI-NEXT: v_readlane_b32 s9, v62, 9 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 vcc_lo, v62, 10 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, vcc_lo, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_readlane_b32 vcc_hi, v62, 11 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s84, 0xff +; SI-NEXT: s_lshl_b32 s8, s61, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_and_b32 s8, s65, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s8, v1 +; SI-NEXT: v_readlane_b32 s8, v62, 12 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s5, s80, 0xff +; SI-NEXT: v_readlane_b32 s9, v62, 13 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_readlane_b32 s8, v62, 14 +; SI-NEXT: v_readlane_b32 s9, v62, 15 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; 
SI-NEXT: v_readlane_b32 s60, v62, 16 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s60, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v52 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s70, 0xff +; SI-NEXT: s_lshl_b32 s8, s72, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_lshl_b32 s5, s22, 8 -; SI-NEXT: s_lshl_b32 s6, s14, 24 -; SI-NEXT: v_add_i32_e32 v5, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_and_b32 s8, s69, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v15 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s8, v1 +; SI-NEXT: v_readlane_b32 s8, v62, 18 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s5, s66, 0xff +; SI-NEXT: v_readlane_b32 s9, v62, 19 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s61, v62, 17 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_readlane_b32 s8, v62, 20 +; SI-NEXT: v_readlane_b32 s9, v62, 21 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 s60, v62, 22 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s60, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_lshl_b32 s5, s28, 8 -; SI-NEXT: s_lshl_b32 s6, s20, 24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s38, 0xff +; SI-NEXT: s_lshl_b32 s8, s75, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v5, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s24, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; 
SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_and_b32 s8, s91, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v13 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s8, v1 +; SI-NEXT: v_readlane_b32 s8, v62, 24 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s5, s52, 0xff +; SI-NEXT: v_readlane_b32 s9, v62, 25 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s61, v62, 23 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_readlane_b32 s8, v62, 26 +; SI-NEXT: v_readlane_b32 s9, v62, 27 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 s60, v62, 28 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s60, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v9 -; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v45 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s98, 0xff +; SI-NEXT: s_lshl_b32 s8, s58, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_lshl_b32 s5, s44, 8 -; SI-NEXT: s_lshl_b32 s6, s26, 24 -; SI-NEXT: v_add_i32_e32 v5, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s40, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_and_b32 s8, s37, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v11 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s8, v1 +; SI-NEXT: v_readlane_b32 s8, v62, 30 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s5, s30, 0xff +; SI-NEXT: v_readlane_b32 s9, v62, 31 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s61, v62, 29 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_readlane_b32 s8, v62, 32 +; SI-NEXT: v_readlane_b32 s9, v62, 33 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 s60, v62, 34 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s60, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v58 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v13 -; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: s_or_b32 s5, s5, 
s8 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s82, 0xff +; SI-NEXT: s_lshl_b32 s8, s43, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_lshl_b32 s5, s58, 8 -; SI-NEXT: s_lshl_b32 s6, s42, 24 -; SI-NEXT: v_add_i32_e32 v5, vcc, 40, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s46, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_and_b32 s8, s89, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v47 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s8, v1 +; SI-NEXT: v_readlane_b32 s8, v62, 36 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s5, s50, 0xff +; SI-NEXT: v_readlane_b32 s9, v62, 37 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_readlane_b32 s8, v62, 38 +; SI-NEXT: v_readlane_b32 s9, v62, 39 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 s42, v62, 40 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s42, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v12 -; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v49 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_lshl_b32 s5, s72, 8 -; SI-NEXT: s_lshl_b32 s6, s56, 24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s94, 0xff +; SI-NEXT: s_lshl_b32 s8, s44, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s60, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_and_b32 s8, s57, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s8, v1 +; SI-NEXT: v_readlane_b32 s8, v62, 42 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s5, s92, 0xff +; SI-NEXT: v_readlane_b32 s9, v62, 43 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s43, v62, 41 +; SI-NEXT: s_or_b32 s5, 
s5, s8 +; SI-NEXT: v_readlane_b32 s8, v62, 44 +; SI-NEXT: v_readlane_b32 s9, v62, 45 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 s42, v62, 46 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s42, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v59 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_lshl_b32 s5, s78, 8 -; SI-NEXT: s_lshl_b32 s6, s62, 24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s64, 0xff +; SI-NEXT: s_lshl_b32 s8, s27, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s74, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_and_b32 s8, s79, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v18 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s8, v1 +; SI-NEXT: v_readlane_b32 s8, v62, 48 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s5, s76, 0xff +; SI-NEXT: v_readlane_b32 s9, v62, 49 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_readlane_b32 s8, v62, 50 +; SI-NEXT: v_readlane_b32 s9, v62, 51 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 s26, v62, 52 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s26, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v61 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v10 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_lshl_b32 s5, s92, 8 -; SI-NEXT: s_lshl_b32 s6, s76, 24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s90, 0xff +; SI-NEXT: s_lshl_b32 s8, s28, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; 
SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s88, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_and_b32 s8, s73, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v55 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s8, v1 +; SI-NEXT: v_readlane_b32 s8, v62, 54 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s5, s62, 0xff +; SI-NEXT: v_readlane_b32 s9, v62, 55 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s27, v62, 53 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_readlane_b32 s8, v62, 56 +; SI-NEXT: v_readlane_b32 s9, v62, 57 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 s26, v62, 58 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s26, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v60 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v23 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_lshl_b32 s5, s34, 8 -; SI-NEXT: s_lshl_b32 s6, s90, 24 -; SI-NEXT: v_readlane_b32 s34, v63, 0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s54, 0xff +; SI-NEXT: s_lshl_b32 s8, s74, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s94, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_and_b32 s8, s59, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v9 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s8, v1 +; SI-NEXT: v_readlane_b32 s8, v62, 60 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s5, s46, 0xff +; SI-NEXT: v_readlane_b32 s9, v62, 61 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s27, v62, 59 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_readlane_b32 s8, v62, 62 +; SI-NEXT: v_readlane_b32 s9, v62, 63 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 s26, v61, 0 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s26, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_and_b32_e32 v1, 0xff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v18 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_lshl_b32 s5, s38, 8 -; SI-NEXT: s_lshl_b32 s6, s30, 24 -; SI-NEXT: v_readlane_b32 s30, v63, 34 -; SI-NEXT: v_readlane_b32 s31, v63, 35 -; SI-NEXT: v_readlane_b32 s38, v63, 4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s68, 0xff +; SI-NEXT: s_lshl_b32 s8, s22, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s36, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_and_b32 s8, s45, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s8, v1 +; SI-NEXT: v_readlane_b32 s8, v61, 2 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s5, s40, 0xff +; SI-NEXT: v_readlane_b32 s9, v61, 3 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s27, v61, 1 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_readlane_b32 s8, v61, 4 +; SI-NEXT: v_readlane_b32 s9, v61, 5 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 s26, v61, 6 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s26, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 -; SI-NEXT: s_lshl_b32 s5, s52, 8 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_lshl_b32 s6, s48, 24 -; SI-NEXT: v_readlane_b32 s52, v63, 10 -; SI-NEXT: v_readlane_b32 s48, v63, 6 -; SI-NEXT: v_readlane_b32 s36, v63, 2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 
offen +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s56, 0xff +; SI-NEXT: s_lshl_b32 s8, s21, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s50, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_and_b32 s8, s29, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v14 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s8, v1 +; SI-NEXT: v_readlane_b32 s8, v61, 8 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s5, s24, 0xff +; SI-NEXT: v_readlane_b32 s9, v61, 9 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_readlane_b32 s8, v61, 10 +; SI-NEXT: v_readlane_b32 s9, v61, 11 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 s20, v61, 12 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s20, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_lshl_b32 s5, s68, 8 -; SI-NEXT: s_lshl_b32 s6, s54, 24 -; SI-NEXT: v_readlane_b32 s68, v63, 18 -; SI-NEXT: v_readlane_b32 s54, v63, 12 -; SI-NEXT: v_readlane_b32 s50, v63, 8 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s34, 0xff +; SI-NEXT: s_lshl_b32 s8, s18, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s64, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_and_b32 s8, s23, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v56 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s8, v1 +; SI-NEXT: v_readlane_b32 s8, v61, 14 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s5, s16, 0xff +; SI-NEXT: 
v_readlane_b32 s9, v61, 15 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_readlane_b32 s8, v61, 16 +; SI-NEXT: v_readlane_b32 s9, v61, 17 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 s16, v61, 18 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s16, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_lshl_b32 s5, s82, 8 -; SI-NEXT: s_lshl_b32 s6, s66, 24 -; SI-NEXT: v_readlane_b32 s82, v63, 24 -; SI-NEXT: v_readlane_b32 s66, v63, 16 -; SI-NEXT: v_readlane_b32 s64, v63, 14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s70, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v0 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s5, s36, 0xff +; SI-NEXT: s_lshl_b32 s8, s15, 8 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_and_b32 s8, s19, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v57 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: s_lshl_b32 s5, s96, 8 -; SI-NEXT: s_lshl_b32 s6, s80, 24 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: v_readlane_b32 s30, v63, 34 +; SI-NEXT: v_readlane_b32 s61, v62, 35 +; SI-NEXT: v_readlane_b32 s43, v62, 47 +; SI-NEXT: v_readlane_b32 s27, v61, 7 +; SI-NEXT: v_readlane_b32 s21, v61, 13 +; SI-NEXT: v_readlane_b32 s17, v61, 19 +; SI-NEXT: v_readlane_b32 s31, v63, 35 +; SI-NEXT: v_readlane_b32 s99, v63, 33 +; SI-NEXT: v_readlane_b32 s98, v63, 32 +; SI-NEXT: v_readlane_b32 s97, v63, 31 ; SI-NEXT: v_readlane_b32 s96, v63, 30 +; SI-NEXT: v_readlane_b32 s87, v63, 29 +; SI-NEXT: v_readlane_b32 s86, v63, 28 +; SI-NEXT: v_readlane_b32 s85, v63, 27 +; SI-NEXT: v_readlane_b32 s84, v63, 26 +; SI-NEXT: v_readlane_b32 s83, v63, 25 +; SI-NEXT: v_readlane_b32 s82, v63, 24 +; SI-NEXT: v_readlane_b32 s81, v63, 23 ; SI-NEXT: v_readlane_b32 s80, v63, 22 +; SI-NEXT: v_readlane_b32 s71, v63, 21 ; SI-NEXT: v_readlane_b32 s70, v63, 20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_readlane_b32 s69, v63, 19 +; SI-NEXT: v_readlane_b32 s68, v63, 18 +; SI-NEXT: v_readlane_b32 s67, v63, 17 +; SI-NEXT: v_readlane_b32 s66, v63, 16 +; SI-NEXT: v_readlane_b32 s65, v63, 15 +; SI-NEXT: v_readlane_b32 s64, v63, 14 +; SI-NEXT: v_readlane_b32 s55, v63, 13 +; SI-NEXT: v_readlane_b32 s54, v63, 12 +; SI-NEXT: v_readlane_b32 s53, v63, 11 +; SI-NEXT: v_readlane_b32 s52, v63, 10 +; SI-NEXT: v_readlane_b32 s51, v63, 9 +; SI-NEXT: v_readlane_b32 s50, v63, 8 +; SI-NEXT: v_readlane_b32 s49, v63, 7 +; SI-NEXT: v_readlane_b32 s48, v63, 6 +; SI-NEXT: v_readlane_b32 s39, v63, 5 +; SI-NEXT: v_readlane_b32 s38, v63, 4 +; SI-NEXT: v_readlane_b32 s37, v63, 3 +; SI-NEXT: v_readlane_b32 s36, v63, 2 +; SI-NEXT: v_readlane_b32 s35, v63, 1 +; SI-NEXT: v_readlane_b32 s34, v63, 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: v_or_b32_e32 v1, s8, v1 +; SI-NEXT: v_readlane_b32 s8, v61, 20 +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: s_and_b32 s5, s10, 0xff +; SI-NEXT: v_readlane_b32 s9, v61, 21 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_readlane_b32 s8, v61, 22 +; SI-NEXT: v_readlane_b32 s9, v61, 23 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 s10, v61, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s10, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x70, v0 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s5, s14, 0xff +; SI-NEXT: s_lshl_b32 s8, s12, 8 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_readlane_b32 s8, v61, 26 +; SI-NEXT: v_readlane_b32 s9, v61, 27 +; SI-NEXT: s_and_b32 s8, s9, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: v_readlane_b32 s11, v61, 25 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: v_or_b32_e32 v1, s8, v1 +; SI-NEXT: v_readlane_b32 s8, v61, 28 +; SI-NEXT: v_readlane_b32 s9, v61, 29 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s84, 0xff +; SI-NEXT: s_lshl_b32 s5, s8, 8 +; SI-NEXT: v_readlane_b32 s8, v61, 30 +; SI-NEXT: v_readlane_b32 s9, v61, 31 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s8, 0xff +; SI-NEXT: v_readlane_b32 s8, v61, 32 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 -; SI-NEXT: buffer_load_dword v3, off, 
s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_lshl_b32 s5, s86, 24 -; SI-NEXT: v_readlane_b32 s86, v63, 28 -; SI-NEXT: v_readlane_b32 s84, v63, 26 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: s_lshl_b32 s8, s8, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s8, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v0 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v35 +; SI-NEXT: s_and_b32 s4, s88, 0xff +; SI-NEXT: s_lshl_b32 s5, s6, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s13, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_readlane_b32 s9, v61, 33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: s_and_b32 s4, s98, 0xff -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 -; SI-NEXT: v_readlane_b32 s98, v63, 32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -170596,23 +171643,23 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; 
VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: v_writelane_b32 v63, s34, 0 ; VI-NEXT: v_writelane_b32 v63, s35, 1 ; VI-NEXT: v_writelane_b32 v63, s36, 2 @@ -170646,195 +171693,212 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_writelane_b32 v63, s30, 30 ; VI-NEXT: v_writelane_b32 v63, s31, 31 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; VI-NEXT: v_readfirstlane_b32 s44, v3 -; VI-NEXT: v_readfirstlane_b32 s45, v4 -; VI-NEXT: v_readfirstlane_b32 s42, v5 -; VI-NEXT: v_readfirstlane_b32 s43, v6 -; VI-NEXT: v_readfirstlane_b32 s40, v7 -; VI-NEXT: v_readfirstlane_b32 s41, v8 -; VI-NEXT: v_readfirstlane_b32 s14, v9 -; VI-NEXT: v_readfirstlane_b32 s15, v10 -; VI-NEXT: v_readfirstlane_b32 s12, v11 -; VI-NEXT: v_readfirstlane_b32 s13, v12 -; VI-NEXT: v_readfirstlane_b32 s10, v13 -; VI-NEXT: v_readfirstlane_b32 s11, v14 -; VI-NEXT: v_readfirstlane_b32 s8, v15 -; VI-NEXT: v_readfirstlane_b32 s9, v16 -; VI-NEXT: v_readfirstlane_b32 s6, v17 -; VI-NEXT: v_readfirstlane_b32 s7, v18 +; VI-NEXT: v_readfirstlane_b32 s48, v3 +; VI-NEXT: v_readfirstlane_b32 s49, v4 +; VI-NEXT: v_readfirstlane_b32 s38, v5 +; VI-NEXT: v_readfirstlane_b32 s39, v6 +; VI-NEXT: v_readfirstlane_b32 s36, v7 +; VI-NEXT: v_readfirstlane_b32 s37, v8 +; VI-NEXT: v_readfirstlane_b32 s34, v9 +; VI-NEXT: v_readfirstlane_b32 s35, v10 +; VI-NEXT: v_readfirstlane_b32 s30, v11 +; VI-NEXT: v_readfirstlane_b32 s31, v12 +; VI-NEXT: v_readfirstlane_b32 s90, v13 +; VI-NEXT: v_readfirstlane_b32 s91, v14 +; VI-NEXT: v_readfirstlane_b32 s88, v15 +; VI-NEXT: v_readfirstlane_b32 s89, v16 +; VI-NEXT: v_readfirstlane_b32 s76, v17 +; VI-NEXT: v_readfirstlane_b32 s77, v18 ; VI-NEXT: v_readfirstlane_b32 s4, v1 -; VI-NEXT: s_and_b64 s[46:47], vcc, exec +; VI-NEXT: s_and_b64 s[6:7], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: ; implicit-def: $vgpr61 : SGPR spill to VGPR lane ; VI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane ; VI-NEXT: s_cbranch_scc0 .LBB91_3 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s46, s5, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 57 -; VI-NEXT: s_lshr_b32 s46, s5, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 56 -; VI-NEXT: s_lshr_b32 s46, s5, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 55 -; 
VI-NEXT: s_lshr_b32 s46, s4, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 54 -; VI-NEXT: s_lshr_b32 s46, s4, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 53 -; VI-NEXT: s_lshr_b32 s46, s29, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 52 -; VI-NEXT: s_lshr_b32 s46, s29, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 51 -; VI-NEXT: s_lshr_b32 s46, s29, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 50 -; VI-NEXT: s_lshr_b32 s46, s28, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 49 -; VI-NEXT: s_lshr_b32 s46, s28, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 48 -; VI-NEXT: s_lshr_b32 s46, s27, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 47 -; VI-NEXT: s_lshr_b32 s46, s27, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 46 -; VI-NEXT: s_lshr_b32 s46, s27, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 45 -; VI-NEXT: s_lshr_b32 s46, s26, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 44 -; VI-NEXT: s_lshr_b32 s46, s26, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 43 -; VI-NEXT: s_lshr_b32 s46, s25, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 42 -; VI-NEXT: s_lshr_b32 s46, s25, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 41 -; VI-NEXT: s_lshr_b32 s46, s25, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 40 -; VI-NEXT: s_lshr_b32 s46, s24, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 39 -; VI-NEXT: s_lshr_b32 s46, s24, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 38 -; VI-NEXT: s_lshr_b32 s46, s23, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 37 -; VI-NEXT: s_lshr_b32 s46, s23, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 36 -; VI-NEXT: s_lshr_b32 s46, s23, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 35 -; VI-NEXT: s_lshr_b32 s46, s22, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 34 -; VI-NEXT: s_lshr_b32 s46, s22, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 33 -; VI-NEXT: s_lshr_b32 s46, s21, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 32 -; VI-NEXT: s_lshr_b32 s46, s21, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 31 -; VI-NEXT: s_lshr_b32 s46, s21, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 30 -; VI-NEXT: s_lshr_b32 s46, s20, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 29 -; VI-NEXT: s_lshr_b32 s46, s20, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 28 -; VI-NEXT: s_lshr_b32 s46, s19, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 27 -; VI-NEXT: s_lshr_b32 s46, s19, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 26 -; VI-NEXT: s_lshr_b32 s46, s19, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 25 -; VI-NEXT: s_lshr_b32 s46, s18, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 24 -; VI-NEXT: s_lshr_b32 s46, s18, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 23 -; VI-NEXT: s_lshr_b32 s46, s17, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 22 -; VI-NEXT: s_lshr_b32 s46, s17, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 21 -; VI-NEXT: s_lshr_b32 s46, s17, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 20 -; VI-NEXT: s_lshr_b32 s46, s16, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 19 -; VI-NEXT: s_lshr_b32 s46, s16, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 18 -; VI-NEXT: s_lshr_b32 s46, s7, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 17 -; VI-NEXT: s_lshr_b32 s46, s7, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 16 -; VI-NEXT: s_lshr_b32 s46, s7, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 15 -; VI-NEXT: s_lshr_b32 s46, s6, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 14 -; VI-NEXT: s_lshr_b32 s46, s6, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 13 -; VI-NEXT: s_lshr_b32 s46, s9, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 12 -; VI-NEXT: s_lshr_b32 s46, s9, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 11 -; VI-NEXT: s_lshr_b32 s46, s9, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 10 -; VI-NEXT: s_lshr_b32 s46, s8, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 9 -; VI-NEXT: s_lshr_b32 s46, s8, 8 -; VI-NEXT: 
v_writelane_b32 v62, s46, 8 -; VI-NEXT: s_lshr_b32 s46, s11, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 7 -; VI-NEXT: s_lshr_b32 s46, s11, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 6 -; VI-NEXT: s_lshr_b32 s46, s11, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 5 -; VI-NEXT: s_lshr_b32 s46, s10, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 4 -; VI-NEXT: s_lshr_b32 s46, s10, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 3 -; VI-NEXT: s_lshr_b32 s46, s13, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 2 -; VI-NEXT: s_lshr_b32 s46, s13, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 1 -; VI-NEXT: s_lshr_b32 s46, s12, 16 -; VI-NEXT: s_lshr_b32 s80, s13, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 0 -; VI-NEXT: s_lshr_b32 s81, s12, 8 -; VI-NEXT: s_lshr_b32 s82, s15, 24 -; VI-NEXT: s_lshr_b32 s83, s15, 16 -; VI-NEXT: s_lshr_b32 s85, s15, 8 -; VI-NEXT: s_lshr_b32 s84, s14, 16 -; VI-NEXT: s_lshr_b32 s86, s14, 8 -; VI-NEXT: s_lshr_b32 s87, s41, 24 -; VI-NEXT: s_lshr_b32 s50, s41, 16 -; VI-NEXT: s_lshr_b32 s52, s41, 8 -; VI-NEXT: s_lshr_b32 s51, s40, 16 -; VI-NEXT: s_lshr_b32 s53, s40, 8 -; VI-NEXT: s_lshr_b32 s54, s43, 24 -; VI-NEXT: s_lshr_b32 s55, s43, 16 -; VI-NEXT: s_lshr_b32 s65, s43, 8 -; VI-NEXT: s_lshr_b32 s64, s42, 16 -; VI-NEXT: s_lshr_b32 s66, s42, 8 -; VI-NEXT: s_lshr_b32 s67, s45, 24 -; VI-NEXT: s_lshr_b32 s68, s45, 16 -; VI-NEXT: s_lshr_b32 s70, s45, 8 -; VI-NEXT: s_lshr_b32 s69, s44, 16 -; VI-NEXT: s_lshr_b32 s71, s44, 8 -; VI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[56:57], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[58:59], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[60:61], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[72:73], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[74:75], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[76:77], s[16:17], 24 -; VI-NEXT: s_lshr_b64 s[78:79], s[6:7], 24 -; VI-NEXT: s_lshr_b64 s[88:89], s[8:9], 24 -; VI-NEXT: s_lshr_b64 s[90:91], s[10:11], 24 -; VI-NEXT: s_lshr_b64 s[30:31], s[12:13], 24 -; VI-NEXT: s_lshr_b64 s[34:35], s[14:15], 24 -; VI-NEXT: s_lshr_b64 s[36:37], s[40:41], 24 -; VI-NEXT: s_lshr_b64 s[38:39], s[42:43], 24 -; VI-NEXT: s_lshr_b64 s[48:49], s[44:45], 24 +; VI-NEXT: s_lshr_b32 s6, s5, 24 +; VI-NEXT: v_writelane_b32 v61, s6, 26 +; VI-NEXT: s_lshr_b32 s6, s5, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 27 +; VI-NEXT: s_lshr_b32 s6, s5, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 28 +; VI-NEXT: s_lshr_b32 s6, s4, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 29 +; VI-NEXT: s_lshr_b32 s6, s4, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 30 +; VI-NEXT: s_lshr_b32 s6, s29, 24 +; VI-NEXT: v_writelane_b32 v61, s6, 31 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 32 +; VI-NEXT: s_lshr_b32 s6, s29, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 33 +; VI-NEXT: s_lshr_b32 s6, s28, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 34 +; VI-NEXT: s_lshr_b32 s6, s28, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 35 +; VI-NEXT: s_lshr_b32 s6, s27, 24 +; VI-NEXT: v_writelane_b32 v61, s6, 36 +; VI-NEXT: s_lshr_b32 s6, s27, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 37 +; VI-NEXT: s_lshr_b32 s6, s27, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 38 +; VI-NEXT: s_lshr_b32 s6, s26, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 39 +; VI-NEXT: s_lshr_b32 s6, s26, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 40 +; VI-NEXT: s_lshr_b32 s6, s25, 24 +; VI-NEXT: v_writelane_b32 v61, s6, 41 +; VI-NEXT: s_lshr_b32 s6, s25, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 42 +; VI-NEXT: s_lshr_b32 s6, s25, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 43 +; VI-NEXT: s_lshr_b32 s6, s24, 16 +; VI-NEXT: v_writelane_b32 v61, 
s6, 44 +; VI-NEXT: s_lshr_b32 s6, s24, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 45 +; VI-NEXT: s_lshr_b32 s6, s23, 24 +; VI-NEXT: v_writelane_b32 v61, s6, 46 +; VI-NEXT: s_lshr_b32 s6, s23, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 47 +; VI-NEXT: s_lshr_b32 s6, s23, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 48 +; VI-NEXT: s_lshr_b32 s6, s22, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 49 +; VI-NEXT: s_lshr_b32 s6, s22, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 50 +; VI-NEXT: s_lshr_b32 s6, s21, 24 +; VI-NEXT: v_writelane_b32 v61, s6, 51 +; VI-NEXT: s_lshr_b32 s6, s21, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 52 +; VI-NEXT: s_lshr_b32 s6, s21, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 53 +; VI-NEXT: s_lshr_b32 s6, s20, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 54 +; VI-NEXT: s_lshr_b32 s6, s20, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 55 +; VI-NEXT: s_lshr_b32 s6, s19, 24 +; VI-NEXT: v_writelane_b32 v61, s6, 56 +; VI-NEXT: s_lshr_b32 s6, s19, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 57 +; VI-NEXT: s_lshr_b32 s6, s19, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 58 +; VI-NEXT: s_lshr_b32 s6, s18, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 59 +; VI-NEXT: s_lshr_b32 s6, s18, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 60 +; VI-NEXT: s_lshr_b32 s6, s17, 24 +; VI-NEXT: v_writelane_b32 v61, s6, 61 +; VI-NEXT: s_lshr_b32 s6, s17, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 62 +; VI-NEXT: s_lshr_b32 s6, s17, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 63 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_writelane_b32 v62, s6, 0 +; VI-NEXT: s_lshr_b32 s6, s16, 8 +; VI-NEXT: v_writelane_b32 v62, s6, 1 +; VI-NEXT: s_lshr_b32 s6, s39, 24 +; VI-NEXT: v_writelane_b32 v61, s6, 18 +; VI-NEXT: s_lshr_b32 s6, s39, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 19 +; VI-NEXT: s_lshr_b32 s6, s39, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 20 +; VI-NEXT: s_lshr_b32 s6, s38, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 16 +; VI-NEXT: s_lshr_b32 s6, s38, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 17 +; VI-NEXT: s_lshr_b32 s6, s49, 24 +; VI-NEXT: v_writelane_b32 v61, s6, 23 +; VI-NEXT: s_lshr_b32 s6, s49, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 24 +; VI-NEXT: s_lshr_b32 s6, s49, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 25 +; VI-NEXT: s_lshr_b32 s6, s48, 16 +; VI-NEXT: v_writelane_b32 v61, s6, 21 +; VI-NEXT: s_lshr_b32 s6, s48, 8 +; VI-NEXT: v_writelane_b32 v61, s6, 22 +; VI-NEXT: s_lshr_b64 vcc, s[4:5], 24 +; VI-NEXT: v_writelane_b32 v61, vcc_lo, 14 +; VI-NEXT: v_writelane_b32 v61, vcc_hi, 15 +; VI-NEXT: s_lshr_b64 vcc, s[28:29], 24 +; VI-NEXT: v_writelane_b32 v61, vcc_lo, 12 +; VI-NEXT: v_writelane_b32 v61, vcc_hi, 13 +; VI-NEXT: s_lshr_b64 vcc, s[26:27], 24 +; VI-NEXT: v_writelane_b32 v61, vcc_lo, 10 +; VI-NEXT: v_writelane_b32 v61, vcc_hi, 11 +; VI-NEXT: s_lshr_b64 vcc, s[24:25], 24 +; VI-NEXT: v_writelane_b32 v61, vcc_lo, 8 +; VI-NEXT: v_writelane_b32 v61, vcc_hi, 9 +; VI-NEXT: s_lshr_b64 vcc, s[22:23], 24 +; VI-NEXT: v_writelane_b32 v61, vcc_lo, 6 +; VI-NEXT: v_writelane_b32 v61, vcc_hi, 7 +; VI-NEXT: s_lshr_b64 vcc, s[20:21], 24 +; VI-NEXT: v_writelane_b32 v61, vcc_lo, 4 +; VI-NEXT: v_writelane_b32 v61, vcc_hi, 5 +; VI-NEXT: s_lshr_b64 vcc, s[18:19], 24 +; VI-NEXT: v_writelane_b32 v61, vcc_lo, 2 +; VI-NEXT: v_writelane_b32 v61, vcc_hi, 3 +; VI-NEXT: s_lshr_b64 vcc, s[16:17], 24 +; VI-NEXT: v_writelane_b32 v61, vcc_lo, 0 +; VI-NEXT: s_lshr_b32 s87, s77, 24 +; VI-NEXT: s_lshr_b32 s43, s77, 16 +; VI-NEXT: s_lshr_b32 s42, s77, 8 +; VI-NEXT: s_lshr_b32 s13, s76, 16 +; VI-NEXT: s_lshr_b32 s11, s76, 8 +; VI-NEXT: s_lshr_b32 s86, s89, 24 +; VI-NEXT: s_lshr_b32 s85, s89, 16 +; 
VI-NEXT: s_lshr_b32 s84, s89, 8 +; VI-NEXT: s_lshr_b32 s9, s88, 16 +; VI-NEXT: s_lshr_b32 s7, s88, 8 +; VI-NEXT: s_lshr_b32 s75, s91, 24 +; VI-NEXT: s_lshr_b32 s74, s91, 16 +; VI-NEXT: s_lshr_b32 s73, s91, 8 +; VI-NEXT: s_lshr_b32 s79, s90, 16 +; VI-NEXT: s_lshr_b32 s78, s90, 8 +; VI-NEXT: s_lshr_b32 s60, s31, 24 +; VI-NEXT: s_lshr_b32 s15, s31, 16 +; VI-NEXT: s_lshr_b32 s14, s31, 8 +; VI-NEXT: s_lshr_b32 s72, s30, 16 +; VI-NEXT: s_lshr_b32 s61, s30, 8 +; VI-NEXT: s_lshr_b32 s63, s35, 24 +; VI-NEXT: s_lshr_b32 s57, s35, 16 +; VI-NEXT: s_lshr_b32 s56, s35, 8 +; VI-NEXT: s_lshr_b32 s83, s34, 16 +; VI-NEXT: s_lshr_b32 s82, s34, 8 +; VI-NEXT: s_lshr_b32 s41, s37, 24 +; VI-NEXT: s_lshr_b32 s47, s37, 16 +; VI-NEXT: s_lshr_b32 s46, s37, 8 +; VI-NEXT: s_lshr_b32 s59, s36, 16 +; VI-NEXT: s_lshr_b32 s45, s36, 8 +; VI-NEXT: v_writelane_b32 v61, vcc_hi, 1 +; VI-NEXT: s_lshr_b64 s[50:51], s[76:77], 24 +; VI-NEXT: s_lshr_b64 s[52:53], s[88:89], 24 +; VI-NEXT: s_lshr_b64 s[54:55], s[90:91], 24 +; VI-NEXT: s_lshr_b64 s[64:65], s[30:31], 24 +; VI-NEXT: s_lshr_b64 s[66:67], s[34:35], 24 +; VI-NEXT: s_lshr_b64 s[68:69], s[36:37], 24 +; VI-NEXT: s_lshr_b64 s[70:71], s[38:39], 24 +; VI-NEXT: s_lshr_b64 s[80:81], s[48:49], 24 +; VI-NEXT: s_mov_b32 s6, s17 +; VI-NEXT: s_mov_b32 s8, s19 +; VI-NEXT: s_mov_b32 s10, s21 +; VI-NEXT: s_mov_b32 s12, s23 +; VI-NEXT: s_mov_b32 s40, s25 +; VI-NEXT: s_mov_b32 s44, s27 +; VI-NEXT: s_mov_b32 s58, s29 +; VI-NEXT: s_mov_b32 s62, s5 ; VI-NEXT: s_cbranch_execnz .LBB91_4 ; VI-NEXT: .LBB91_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s46, s45, 16 -; VI-NEXT: v_mov_b32_e32 v31, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s46, v31 +; VI-NEXT: s_lshl_b32 s6, s49, 16 +; VI-NEXT: v_mov_b32_e32 v25, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s6, v25 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s45, s45, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s49, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s45, v31 +; VI-NEXT: v_add_f32_e32 v2, s6, v25 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 @@ -170842,53 +171906,33 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s45, s44, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s45, v31 -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_lshrrev_b64 v[1:2], 16, v[1:2] +; VI-NEXT: s_lshl_b32 s6, s48, 16 +; VI-NEXT: v_add_f32_e32 v2, s6, v25 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s44, s44, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s44, v31 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: s_lshl_b32 s44, s43, 16 -; VI-NEXT: v_alignbit_b32 v1, v3, v1, 16 -; 
VI-NEXT: v_add_f32_e32 v3, s44, v31 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s6, s48, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s6, v25 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s43, s43, 0xffff0000 +; VI-NEXT: s_lshl_b32 s6, s39, 16 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s43, v31 +; VI-NEXT: v_add_f32_e32 v4, s6, v25 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: s_and_b32 s6, s39, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: s_lshl_b32 s43, s42, 16 -; VI-NEXT: v_alignbit_b32 v4, v4, v3, 16 -; VI-NEXT: v_add_f32_e32 v3, s43, v31 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s42, s42, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; VI-NEXT: v_add_f32_e32 v5, s42, v31 +; VI-NEXT: v_add_f32_e32 v5, s6, v25 ; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 @@ -170896,53 +171940,33 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: s_lshl_b32 s42, s41, 16 -; VI-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; VI-NEXT: v_add_f32_e32 v5, s42, v31 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5] +; VI-NEXT: s_lshl_b32 s6, s38, 16 +; VI-NEXT: v_add_f32_e32 v5, s6, v25 ; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: s_and_b32 s41, s41, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s38, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_add_f32_e32 v6, s41, v31 +; VI-NEXT: v_add_f32_e32 v6, s6, v25 ; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 ; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_or_b32_e32 v8, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: s_lshl_b32 s6, s37, 16 ; VI-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: s_lshl_b32 s41, s40, 16 -; VI-NEXT: v_alignbit_b32 v6, v6, v5, 16 -; VI-NEXT: v_add_f32_e32 v5, s41, v31 -; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: s_and_b32 s40, s40, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc -; VI-NEXT: v_add_f32_e32 v7, s40, v31 +; VI-NEXT: v_add_f32_e32 v7, s6, v25 ; VI-NEXT: v_bfe_u32 v8, v7, 16, 1 ; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: s_and_b32 s6, s37, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, 
vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: s_lshl_b32 s40, s15, 16 -; VI-NEXT: v_alignbit_b32 v5, v7, v5, 16 -; VI-NEXT: v_add_f32_e32 v7, s40, v31 -; VI-NEXT: v_bfe_u32 v8, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: s_and_b32 s15, s15, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc -; VI-NEXT: v_add_f32_e32 v8, s15, v31 +; VI-NEXT: v_add_f32_e32 v8, s6, v25 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 @@ -170950,53 +171974,33 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: s_lshl_b32 s15, s14, 16 -; VI-NEXT: v_alignbit_b32 v8, v8, v7, 16 -; VI-NEXT: v_add_f32_e32 v7, s15, v31 -; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_lshrrev_b64 v[7:8], 16, v[7:8] +; VI-NEXT: s_lshl_b32 s6, s36, 16 +; VI-NEXT: v_add_f32_e32 v8, s6, v25 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: s_and_b32 s14, s14, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc -; VI-NEXT: v_add_f32_e32 v9, s14, v31 -; VI-NEXT: v_bfe_u32 v10, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: s_lshl_b32 s14, s13, 16 -; VI-NEXT: v_alignbit_b32 v7, v9, v7, 16 -; VI-NEXT: v_add_f32_e32 v9, s14, v31 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: s_and_b32 s6, s36, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_add_f32_e32 v9, s6, v25 ; VI-NEXT: v_bfe_u32 v10, v9, 16, 1 ; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 ; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: s_and_b32 s13, s13, 0xffff0000 +; VI-NEXT: s_lshl_b32 s6, s35, 16 ; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc -; VI-NEXT: v_add_f32_e32 v10, s13, v31 +; VI-NEXT: v_add_f32_e32 v10, s6, v25 ; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 ; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 ; VI-NEXT: v_or_b32_e32 v12, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: s_and_b32 s6, s35, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: s_lshl_b32 s13, s12, 16 -; VI-NEXT: v_alignbit_b32 v10, v10, v9, 16 -; VI-NEXT: v_add_f32_e32 v9, s13, v31 -; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: v_or_b32_e32 v12, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: s_and_b32 s12, s12, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc -; VI-NEXT: v_add_f32_e32 v11, s12, v31 +; VI-NEXT: v_add_f32_e32 v11, s6, v25 ; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 ; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 @@ 
-171004,53 +172008,33 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: s_lshl_b32 s12, s11, 16 -; VI-NEXT: v_alignbit_b32 v9, v11, v9, 16 -; VI-NEXT: v_add_f32_e32 v11, s12, v31 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[10:11] +; VI-NEXT: s_lshl_b32 s6, s34, 16 +; VI-NEXT: v_add_f32_e32 v11, s6, v25 ; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 ; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 ; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: s_and_b32 s11, s11, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s34, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc -; VI-NEXT: v_add_f32_e32 v12, s11, v31 +; VI-NEXT: v_add_f32_e32 v12, s6, v25 ; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 ; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: s_lshl_b32 s6, s31, 16 ; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: s_lshl_b32 s11, s10, 16 -; VI-NEXT: v_alignbit_b32 v12, v12, v11, 16 -; VI-NEXT: v_add_f32_e32 v11, s11, v31 -; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: s_and_b32 s10, s10, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v11, v13, v14, vcc -; VI-NEXT: v_add_f32_e32 v13, s10, v31 +; VI-NEXT: v_add_f32_e32 v13, s6, v25 ; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 ; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 ; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_and_b32 s6, s31, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: s_lshl_b32 s10, s9, 16 -; VI-NEXT: v_alignbit_b32 v11, v13, v11, 16 -; VI-NEXT: v_add_f32_e32 v13, s10, v31 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_and_b32 s9, s9, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc -; VI-NEXT: v_add_f32_e32 v14, s9, v31 +; VI-NEXT: v_add_f32_e32 v14, s6, v25 ; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 ; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 @@ -171058,53 +172042,33 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; VI-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: s_lshl_b32 s9, s8, 16 -; VI-NEXT: v_alignbit_b32 v14, v14, v13, 16 -; VI-NEXT: v_add_f32_e32 v13, s9, v31 -; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_lshrrev_b64 v[13:14], 16, v[13:14] +; VI-NEXT: s_lshl_b32 s6, s30, 16 +; VI-NEXT: v_add_f32_e32 v14, s6, v25 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_and_b32 s8, s8, 0xffff0000 -; 
VI-NEXT: v_cndmask_b32_e32 v13, v15, v16, vcc -; VI-NEXT: v_add_f32_e32 v15, s8, v31 -; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: s_lshl_b32 s8, s7, 16 -; VI-NEXT: v_alignbit_b32 v13, v15, v13, 16 -; VI-NEXT: v_add_f32_e32 v15, s8, v31 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: s_and_b32 s6, s30, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc +; VI-NEXT: v_add_f32_e32 v15, s6, v25 ; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 ; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 ; VI-NEXT: v_or_b32_e32 v17, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: s_and_b32 s7, s7, 0xffff0000 +; VI-NEXT: s_lshl_b32 s6, s91, 16 ; VI-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s7, v31 +; VI-NEXT: v_add_f32_e32 v16, s6, v25 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: s_and_b32 s6, s91, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: s_lshl_b32 s7, s6, 16 -; VI-NEXT: v_alignbit_b32 v16, v16, v15, 16 -; VI-NEXT: v_add_f32_e32 v15, s7, v31 -; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: s_and_b32 s6, s6, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc -; VI-NEXT: v_add_f32_e32 v17, s6, v31 +; VI-NEXT: v_add_f32_e32 v17, s6, v25 ; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 @@ -171112,53 +172076,33 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: s_lshl_b32 s6, s17, 16 -; VI-NEXT: v_alignbit_b32 v15, v17, v15, 16 -; VI-NEXT: v_add_f32_e32 v17, s6, v31 +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[16:17] +; VI-NEXT: s_lshl_b32 s6, s90, 16 +; VI-NEXT: v_add_f32_e32 v17, s6, v25 ; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s90, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; VI-NEXT: v_add_f32_e32 v18, s6, v31 +; VI-NEXT: v_add_f32_e32 v18, s6, v25 ; VI-NEXT: v_bfe_u32 v19, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v18 ; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 ; VI-NEXT: v_or_b32_e32 v20, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: s_lshl_b32 s6, s89, 16 ; VI-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; VI-NEXT: s_lshl_b32 s6, s16, 16 -; VI-NEXT: v_alignbit_b32 v18, v18, v17, 16 -; VI-NEXT: v_add_f32_e32 v17, s6, v31 -; VI-NEXT: v_bfe_u32 v19, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v19, vcc, 
v19, v17 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 -; VI-NEXT: v_or_b32_e32 v20, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v17, v19, v20, vcc -; VI-NEXT: v_add_f32_e32 v19, s6, v31 -; VI-NEXT: v_bfe_u32 v20, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v20, vcc, v20, v19 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x7fff, v20 -; VI-NEXT: v_or_b32_e32 v21, 0x400000, v19 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: s_lshl_b32 s6, s19, 16 -; VI-NEXT: v_alignbit_b32 v17, v19, v17, 16 -; VI-NEXT: v_add_f32_e32 v19, s6, v31 +; VI-NEXT: v_add_f32_e32 v19, s6, v25 ; VI-NEXT: v_bfe_u32 v20, v19, 16, 1 ; VI-NEXT: v_add_u32_e32 v20, vcc, v20, v19 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x7fff, v20 ; VI-NEXT: v_or_b32_e32 v21, 0x400000, v19 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s89, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc -; VI-NEXT: v_add_f32_e32 v20, s6, v31 +; VI-NEXT: v_add_f32_e32 v20, s6, v25 ; VI-NEXT: v_bfe_u32 v21, v20, 16, 1 ; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v20 ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 @@ -171166,863 +172110,1089 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; VI-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: s_lshl_b32 s6, s18, 16 -; VI-NEXT: v_alignbit_b32 v20, v20, v19, 16 -; VI-NEXT: v_add_f32_e32 v19, s6, v31 -; VI-NEXT: v_bfe_u32 v21, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v19 +; VI-NEXT: v_lshrrev_b64 v[19:20], 16, v[19:20] +; VI-NEXT: s_lshl_b32 s6, s88, 16 +; VI-NEXT: v_add_f32_e32 v20, s6, v25 +; VI-NEXT: v_bfe_u32 v21, v20, 16, 1 +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v20 ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 -; VI-NEXT: v_or_b32_e32 v22, 0x400000, v19 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: s_and_b32 s6, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v19, v21, v22, vcc -; VI-NEXT: v_add_f32_e32 v21, s6, v31 -; VI-NEXT: v_bfe_u32 v22, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v22, vcc, v22, v21 -; VI-NEXT: v_add_u32_e32 v22, vcc, 0x7fff, v22 -; VI-NEXT: v_or_b32_e32 v23, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: s_lshl_b32 s6, s21, 16 -; VI-NEXT: v_alignbit_b32 v19, v21, v19, 16 -; VI-NEXT: v_add_f32_e32 v21, s6, v31 +; VI-NEXT: v_or_b32_e32 v22, 0x400000, v20 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; VI-NEXT: s_and_b32 s6, s88, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc +; VI-NEXT: v_add_f32_e32 v21, s6, v25 ; VI-NEXT: v_bfe_u32 v22, v21, 16, 1 ; VI-NEXT: v_add_u32_e32 v22, vcc, v22, v21 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x7fff, v22 ; VI-NEXT: v_or_b32_e32 v23, 0x400000, v21 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 +; VI-NEXT: s_lshl_b32 s6, s77, 16 ; VI-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc -; VI-NEXT: v_add_f32_e32 v22, s6, v31 +; VI-NEXT: v_add_f32_e32 v22, s6, v25 ; VI-NEXT: v_bfe_u32 v23, v22, 16, 1 ; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v22 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 ; VI-NEXT: v_or_b32_e32 v24, 0x400000, v22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; VI-NEXT: s_and_b32 s6, s77, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v22, 
v23, v24, vcc -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: s_lshl_b32 s6, s20, 16 -; VI-NEXT: v_alignbit_b32 v22, v22, v21, 16 -; VI-NEXT: v_add_f32_e32 v21, s6, v31 -; VI-NEXT: v_bfe_u32 v23, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v21 -; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 -; VI-NEXT: v_or_b32_e32 v24, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: s_and_b32 s6, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v21, v23, v24, vcc -; VI-NEXT: v_add_f32_e32 v23, s6, v31 +; VI-NEXT: v_add_f32_e32 v23, s6, v25 ; VI-NEXT: v_bfe_u32 v24, v23, 16, 1 ; VI-NEXT: v_add_u32_e32 v24, vcc, v24, v23 ; VI-NEXT: v_add_u32_e32 v24, vcc, 0x7fff, v24 -; VI-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; VI-NEXT: v_or_b32_e32 v26, 0x400000, v23 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc +; VI-NEXT: v_cndmask_b32_e32 v23, v24, v26, vcc ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: s_lshl_b32 s6, s23, 16 -; VI-NEXT: v_alignbit_b32 v21, v23, v21, 16 -; VI-NEXT: v_add_f32_e32 v23, s6, v31 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23] +; VI-NEXT: s_lshl_b32 s6, s76, 16 +; VI-NEXT: v_add_f32_e32 v23, s6, v25 ; VI-NEXT: v_bfe_u32 v24, v23, 16, 1 ; VI-NEXT: v_add_u32_e32 v24, vcc, v24, v23 ; VI-NEXT: v_add_u32_e32 v24, vcc, 0x7fff, v24 -; VI-NEXT: v_or_b32_e32 v25, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc -; VI-NEXT: v_add_f32_e32 v24, s6, v31 -; VI-NEXT: v_bfe_u32 v25, v24, 16, 1 -; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v24 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 -; VI-NEXT: v_or_b32_e32 v26, 0x400000, v24 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: s_lshl_b32 s6, s22, 16 -; VI-NEXT: v_alignbit_b32 v24, v24, v23, 16 -; VI-NEXT: v_add_f32_e32 v23, s6, v31 -; VI-NEXT: v_bfe_u32 v25, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v23 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 ; VI-NEXT: v_or_b32_e32 v26, 0x400000, v23 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: s_and_b32 s6, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v23, v25, v26, vcc -; VI-NEXT: v_add_f32_e32 v25, s6, v31 -; VI-NEXT: v_bfe_u32 v26, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v25 +; VI-NEXT: s_and_b32 s6, s76, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v23, v24, v26, vcc +; VI-NEXT: v_add_f32_e32 v24, s6, v25 +; VI-NEXT: v_bfe_u32 v26, v24, 16, 1 +; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v24 ; VI-NEXT: v_add_u32_e32 v26, vcc, 0x7fff, v26 -; VI-NEXT: v_or_b32_e32 v27, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: s_lshl_b32 s6, s25, 16 -; VI-NEXT: v_alignbit_b32 v23, v25, v23, 16 -; VI-NEXT: v_add_f32_e32 v25, s6, v31 -; VI-NEXT: v_bfe_u32 v26, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v25 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x7fff, v26 -; VI-NEXT: v_or_b32_e32 v27, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc -; VI-NEXT: v_add_f32_e32 v26, s6, v31 -; VI-NEXT: v_bfe_u32 v27, v26, 16, 1 -; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v26 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 -; VI-NEXT: v_or_b32_e32 v28, 0x400000, v26 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v24 +; VI-NEXT: v_cmp_u_f32_e32 
vcc, v24, v24 +; VI-NEXT: s_lshl_b32 s6, s17, 16 +; VI-NEXT: v_cndmask_b32_e32 v24, v26, v27, vcc +; VI-NEXT: v_add_f32_e32 v26, s6, v25 +; VI-NEXT: v_readfirstlane_b32 s6, v26 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_add_i32 s8, s7, 0x7fff +; VI-NEXT: s_or_b32 s9, s6, 0x400000 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: v_cndmask_b32_e32 v26, v27, v28, vcc -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: s_lshl_b32 s6, s24, 16 -; VI-NEXT: v_alignbit_b32 v26, v26, v25, 16 -; VI-NEXT: v_add_f32_e32 v25, s6, v31 -; VI-NEXT: v_bfe_u32 v27, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v25 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 -; VI-NEXT: v_or_b32_e32 v28, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: s_and_b32 s6, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v25, v27, v28, vcc -; VI-NEXT: v_add_f32_e32 v27, s6, v31 -; VI-NEXT: v_bfe_u32 v28, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v27 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28 -; VI-NEXT: v_or_b32_e32 v29, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: s_lshl_b32 s6, s27, 16 -; VI-NEXT: v_alignbit_b32 v25, v27, v25, 16 -; VI-NEXT: v_add_f32_e32 v27, s6, v31 -; VI-NEXT: v_bfe_u32 v28, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v27 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28 -; VI-NEXT: v_or_b32_e32 v29, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc -; VI-NEXT: v_add_f32_e32 v28, s6, v31 -; VI-NEXT: v_bfe_u32 v29, v28, 16, 1 -; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v28 -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 -; VI-NEXT: v_or_b32_e32 v30, 0x400000, v28 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; VI-NEXT: v_cndmask_b32_e32 v28, v29, v30, vcc -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: s_lshl_b32 s6, s26, 16 -; VI-NEXT: v_alignbit_b32 v28, v28, v27, 16 -; VI-NEXT: v_add_f32_e32 v27, s6, v31 -; VI-NEXT: v_bfe_u32 v29, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v27 -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 -; VI-NEXT: v_or_b32_e32 v30, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: s_and_b32 s6, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v27, v29, v30, vcc -; VI-NEXT: v_add_f32_e32 v29, s6, v31 -; VI-NEXT: v_bfe_u32 v30, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v29 -; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30 -; VI-NEXT: v_or_b32_e32 v32, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: v_cndmask_b32_e32 v29, v30, v32, vcc -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: s_lshl_b32 s6, s29, 16 -; VI-NEXT: v_alignbit_b32 v27, v29, v27, 16 -; VI-NEXT: v_add_f32_e32 v29, s6, v31 -; VI-NEXT: v_bfe_u32 v30, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v29 -; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30 -; VI-NEXT: v_or_b32_e32 v32, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: s_and_b32 s6, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v29, v30, v32, vcc -; VI-NEXT: v_add_f32_e32 v30, s6, v31 -; VI-NEXT: v_bfe_u32 v32, v30, 16, 1 -; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v30 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x7fff, v32 -; VI-NEXT: v_or_b32_e32 v33, 0x400000, v30 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; VI-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, 
v30 -; VI-NEXT: s_lshl_b32 s6, s28, 16 -; VI-NEXT: v_alignbit_b32 v30, v30, v29, 16 -; VI-NEXT: v_add_f32_e32 v29, s6, v31 -; VI-NEXT: v_bfe_u32 v32, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v29 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x7fff, v32 -; VI-NEXT: v_or_b32_e32 v33, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v29, v32, v33, vcc -; VI-NEXT: v_add_f32_e32 v32, s6, v31 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: s_lshl_b32 s6, s5, 16 -; VI-NEXT: v_alignbit_b32 v29, v32, v29, 16 -; VI-NEXT: v_add_f32_e32 v32, s6, v31 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s6, s9, s8 +; VI-NEXT: s_and_b32 s7, s17, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s8, s7, 0x10010 +; VI-NEXT: s_add_i32 s8, s8, s7 +; VI-NEXT: s_add_i32 s10, s8, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[8:9], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s10 +; VI-NEXT: s_lshr_b32 s7, s7, 16 +; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16 +; VI-NEXT: s_lshl_b32 s7, s16, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s8, s7, 0x10010 +; VI-NEXT: s_add_i32 s8, s8, s7 +; VI-NEXT: s_add_i32 s10, s8, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[8:9], vcc, exec +; VI-NEXT: s_cselect_b32 s8, s7, s10 +; VI-NEXT: s_and_b32 s7, s16, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[10:11], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s9, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s19, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_lshr_b64 s[16:17], s[8:9], 16 +; VI-NEXT: s_bfe_u32 s8, s7, 0x10010 +; VI-NEXT: s_add_i32 s8, s8, s7 +; VI-NEXT: s_add_i32 s10, s8, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[8:9], vcc, exec +; VI-NEXT: s_cselect_b32 s8, s7, s10 +; VI-NEXT: s_and_b32 s7, s19, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[10:11], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s9, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s18, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], 16 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: 
s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[10:11], vcc, exec +; VI-NEXT: s_cselect_b32 s10, s7, s9 +; VI-NEXT: s_and_b32 s7, s18, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[12:13], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s11, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s21, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[18:19], s[10:11], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[10:11], vcc, exec +; VI-NEXT: s_cselect_b32 s10, s7, s9 +; VI-NEXT: s_and_b32 s7, s21, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[12:13], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s11, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s20, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[10:11], s[10:11], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[12:13], vcc, exec +; VI-NEXT: s_cselect_b32 s12, s7, s9 +; VI-NEXT: s_and_b32 s7, s20, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s13, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s23, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[20:21], s[12:13], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[12:13], vcc, exec +; VI-NEXT: s_cselect_b32 s12, s7, s9 +; VI-NEXT: s_and_b32 s7, s23, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s13, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s22, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[12:13], s[12:13], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 +; VI-NEXT: s_and_b32 s7, s22, 0xffff0000 +; VI-NEXT: v_add_f32_e32 
v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[22:23], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s15, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s25, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[22:23], s[14:15], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 +; VI-NEXT: s_and_b32 s7, s25, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[40:41], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s15, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s24, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[40:41], s[14:15], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 +; VI-NEXT: s_and_b32 s7, s24, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[24:25], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s15, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s27, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[24:25], s[14:15], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 +; VI-NEXT: s_and_b32 s7, s27, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s15, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s26, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[44:45], s[14:15], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 +; VI-NEXT: s_and_b32 s7, s26, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[26:27], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s15, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s29, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[26:27], s[14:15], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 +; VI-NEXT: s_and_b32 s7, s29, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s15, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s28, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[58:59], s[14:15], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 +; VI-NEXT: s_and_b32 s7, s28, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[28:29], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s15, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s5, 16 +; VI-NEXT: v_add_f32_e32 v26, s7, v25 +; VI-NEXT: v_readfirstlane_b32 s7, v26 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[28:29], s[14:15], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; VI-NEXT: v_add_f32_e32 v33, s5, v31 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_add_f32_e32 v26, s5, v25 +; VI-NEXT: v_readfirstlane_b32 s5, v26 +; VI-NEXT: s_bfe_u32 s7, s5, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s5 +; VI-NEXT: s_addk_i32 s7, 0x7fff +; VI-NEXT: s_bitset1_b32 s5, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s5, s5, s7 +; VI-NEXT: s_lshr_b32 s15, s5, 16 ; VI-NEXT: s_lshl_b32 s5, s4, 16 -; VI-NEXT: v_alignbit_b32 v32, v33, v32, 16 -; VI-NEXT: v_add_f32_e32 v33, s5, v31 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 +; VI-NEXT: v_add_f32_e32 v26, s5, v25 +; VI-NEXT: v_readfirstlane_b32 s5, v26 +; VI-NEXT: s_bfe_u32 s7, s5, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s5 +; VI-NEXT: s_lshr_b64 s[62:63], s[14:15], 16 +; VI-NEXT: s_addk_i32 s7, 0x7fff +; VI-NEXT: s_bitset1_b32 s5, 22 
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[23:24] +; VI-NEXT: s_cselect_b32 s14, s5, s7 ; VI-NEXT: s_and_b32 s4, s4, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; VI-NEXT: v_add_f32_e32 v31, s4, v31 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v31, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v31 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v31 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; VI-NEXT: v_cndmask_b32_e32 v31, v34, v35, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v33, 16 -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[25:26] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded 
Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[3:4] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[1:2] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v32 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; VI-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v24, v22 +; VI-NEXT: v_add_f32_e32 v25, s4, v25 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_readfirstlane_b32 s4, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[23:24] +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21] +; VI-NEXT: v_mov_b32_e32 v21, v19 +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[20:21] +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_mov_b32_e32 v18, v16 +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[17:18] +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[14:15] +; VI-NEXT: v_mov_b32_e32 v15, v13 +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: s_bfe_u32 s5, s4, 0x10010 +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[14:15] +; VI-NEXT: v_lshrrev_b64 v[11:12], 16, v[11:12] +; VI-NEXT: s_add_i32 s5, s5, s4 +; VI-NEXT: v_mov_b32_e32 v12, v10 +; VI-NEXT: s_add_i32 s7, s5, 0x7fff +; VI-NEXT: s_or_b32 s9, s4, 0x400000 +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[11:12] +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[8:9] +; VI-NEXT: s_cselect_b32 s4, s9, s7 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b64 v[5:6], 16, v[5:6] +; VI-NEXT: v_mov_b32_e32 v9, v7 +; VI-NEXT: s_lshr_b32 s15, s4, 16 +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_lshr_b64 s[4:5], s[14:15], 16 +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[8:9] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_mov_b32 s27, s44 +; VI-NEXT: s_mov_b32 s29, s58 +; VI-NEXT: s_mov_b32 s5, s62 +; VI-NEXT: 
v_lshrrev_b64 v[30:31], 24, v[5:6] +; VI-NEXT: s_mov_b32 s17, s6 +; VI-NEXT: s_mov_b32 s19, s8 +; VI-NEXT: s_mov_b32 s21, s10 +; VI-NEXT: s_mov_b32 s23, s12 +; VI-NEXT: s_mov_b32 s25, s40 +; VI-NEXT: s_lshr_b64 s[74:75], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[26:27], 24 +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[2:3] +; VI-NEXT: s_lshr_b64 s[36:37], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[50:51], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[52:53], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s11, s62, 24 +; VI-NEXT: s_lshr_b32 s13, s62, 16 +; VI-NEXT: s_lshr_b32 s14, s62, 8 +; VI-NEXT: s_lshr_b32 s15, s4, 16 +; VI-NEXT: s_lshr_b32 s17, s4, 8 +; VI-NEXT: s_lshr_b32 s19, s58, 24 +; VI-NEXT: s_lshr_b32 s21, s58, 16 +; VI-NEXT: s_lshr_b32 s23, s58, 8 +; VI-NEXT: s_lshr_b32 s25, s28, 16 +; VI-NEXT: s_lshr_b32 s27, s28, 8 +; VI-NEXT: s_lshr_b32 s29, s44, 24 +; VI-NEXT: s_lshr_b32 s41, s44, 16 +; VI-NEXT: s_lshr_b32 s42, s44, 8 +; VI-NEXT: s_lshr_b32 s43, s26, 16 +; VI-NEXT: s_lshr_b32 s45, s26, 8 +; VI-NEXT: s_lshr_b32 s46, s40, 24 +; VI-NEXT: s_lshr_b32 s47, s40, 16 +; VI-NEXT: s_lshr_b32 s56, s40, 8 +; VI-NEXT: s_lshr_b32 s57, s24, 16 +; VI-NEXT: s_lshr_b32 s59, s24, 8 +; VI-NEXT: s_lshr_b32 s60, s12, 24 +; VI-NEXT: s_lshr_b32 s61, s12, 16 +; VI-NEXT: s_lshr_b32 s63, s12, 8 +; VI-NEXT: s_lshr_b32 s72, s22, 16 +; VI-NEXT: s_lshr_b32 s73, s22, 8 +; VI-NEXT: s_lshr_b32 s75, s10, 24 +; VI-NEXT: s_lshr_b32 s76, s10, 16 +; VI-NEXT: s_lshr_b32 s77, s10, 8 +; VI-NEXT: s_lshr_b32 s79, s20, 16 +; VI-NEXT: s_lshr_b32 s88, s20, 8 +; VI-NEXT: s_lshr_b32 s89, s8, 24 +; VI-NEXT: s_lshr_b32 s90, s8, 16 +; VI-NEXT: s_lshr_b32 s91, s8, 8 +; VI-NEXT: s_lshr_b32 s31, s18, 16 +; VI-NEXT: s_lshr_b32 s34, s18, 8 +; VI-NEXT: s_lshr_b32 vcc_lo, s6, 24 +; VI-NEXT: s_lshr_b32 vcc_hi, s6, 16 +; VI-NEXT: s_lshr_b32 s35, s6, 8 +; VI-NEXT: s_lshr_b32 s9, s16, 16 +; VI-NEXT: s_lshr_b32 s7, s16, 8 +; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v22 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v22 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v23 +; VI-NEXT: v_lshrrev_b32_e32 v18, 24, v19 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v19 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 -; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v2 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v12 -; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v8 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v2 -; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[23:24] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v16 -; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v13 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v12 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v7 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v14 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v14 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v13 -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v12 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v11 -; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v20 +; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v16 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v48, 24, v13 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v53, 24, v10 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v10 -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v9 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v3 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v1 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:180 ; 
4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v7 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v47, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v60, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v2 ; VI-NEXT: s_branch .LBB91_5 ; VI-NEXT: .LBB91_3: +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr7 +; VI-NEXT: ; implicit-def: $sgpr9 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 ; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr71 -; VI-NEXT: ; implicit-def: $sgpr69 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr82 +; VI-NEXT: ; implicit-def: $sgpr83 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr84 +; VI-NEXT: ; implicit-def: $sgpr85 +; VI-NEXT: ; implicit-def: $sgpr86 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr87 +; VI-NEXT: ; implicit-def: $sgpr80 ; VI-NEXT: ; implicit-def: $sgpr70 ; VI-NEXT: ; implicit-def: $sgpr68 -; VI-NEXT: ; implicit-def: $sgpr67 ; VI-NEXT: ; implicit-def: $sgpr66 ; VI-NEXT: ; implicit-def: $sgpr64 -; VI-NEXT: ; implicit-def: $sgpr65 -; VI-NEXT: ; implicit-def: $sgpr55 ; VI-NEXT: ; implicit-def: $sgpr54 -; VI-NEXT: ; implicit-def: $sgpr53 -; VI-NEXT: ; implicit-def: $sgpr51 ; VI-NEXT: ; implicit-def: $sgpr52 ; VI-NEXT: ; implicit-def: $sgpr50 -; VI-NEXT: ; implicit-def: $sgpr87 -; VI-NEXT: ; implicit-def: $sgpr86 -; VI-NEXT: ; implicit-def: $sgpr84 -; VI-NEXT: ; implicit-def: $sgpr85 -; VI-NEXT: ; implicit-def: $sgpr83 -; VI-NEXT: ; implicit-def: $sgpr82 -; VI-NEXT: ; implicit-def: $sgpr81 -; VI-NEXT: ; implicit-def: $sgpr80 -; VI-NEXT: ; implicit-def: $sgpr76 -; VI-NEXT: ; implicit-def: $sgpr74 -; VI-NEXT: ; implicit-def: $sgpr72 -; VI-NEXT: ; implicit-def: $sgpr62 -; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; 
implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: v_writelane_b32 v61, s6, 0 +; VI-NEXT: v_writelane_b32 v61, s7, 1 +; VI-NEXT: v_writelane_b32 v61, s8, 2 +; VI-NEXT: v_writelane_b32 v61, s9, 3 +; VI-NEXT: v_writelane_b32 v61, s10, 4 +; VI-NEXT: v_writelane_b32 v61, s11, 5 +; VI-NEXT: v_writelane_b32 v61, s12, 6 +; VI-NEXT: v_writelane_b32 v61, s13, 7 +; VI-NEXT: v_writelane_b32 v61, s40, 8 +; 
VI-NEXT: v_writelane_b32 v61, s41, 9 +; VI-NEXT: v_writelane_b32 v61, s44, 10 +; VI-NEXT: v_writelane_b32 v61, s45, 11 +; VI-NEXT: v_writelane_b32 v61, s58, 12 +; VI-NEXT: v_writelane_b32 v61, s59, 13 +; VI-NEXT: v_writelane_b32 v61, s62, 14 +; VI-NEXT: v_writelane_b32 v61, s63, 15 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr44 ; VI-NEXT: ; implicit-def: $sgpr58 -; VI-NEXT: ; implicit-def: $sgpr56 -; VI-NEXT: ; implicit-def: $sgpr48 -; VI-NEXT: ; implicit-def: $sgpr38 -; VI-NEXT: ; implicit-def: $sgpr36 -; VI-NEXT: ; implicit-def: $sgpr34 -; VI-NEXT: ; implicit-def: $sgpr30 -; VI-NEXT: ; implicit-def: $sgpr90 -; VI-NEXT: ; implicit-def: $sgpr88 -; VI-NEXT: ; implicit-def: $sgpr78 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 
-; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr62 ; VI-NEXT: s_branch .LBB91_2 ; VI-NEXT: .LBB91_4: -; VI-NEXT: v_mov_b32_e32 v33, s71 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s69 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s70 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s68 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s67 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s86 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s83 -; VI-NEXT: v_mov_b32_e32 v31, s4 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s82 -; VI-NEXT: v_readlane_b32 s4, v62, 0 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 1 -; VI-NEXT: v_mov_b32_e32 v40, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 2 -; VI-NEXT: v_mov_b32_e32 v44, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 3 -; VI-NEXT: v_mov_b32_e32 v54, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 4 -; VI-NEXT: v_mov_b32_e32 v53, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 5 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 6 -; VI-NEXT: v_mov_b32_e32 v51, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 7 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 8 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 9 -; VI-NEXT: v_mov_b32_e32 v56, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 10 -; VI-NEXT: v_mov_b32_e32 v47, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 11 -; VI-NEXT: v_mov_b32_e32 v48, s4 -; VI-NEXT: v_readlane_b32 
s4, v62, 12 -; VI-NEXT: v_mov_b32_e32 v43, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 13 -; VI-NEXT: v_mov_b32_e32 v46, s4 -; VI-NEXT: v_mov_b32_e32 v45, s72 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v45, s74 -; VI-NEXT: v_mov_b32_e32 v42, s54 -; VI-NEXT: v_mov_b32_e32 v41, s46 -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v41, s56 -; VI-NEXT: v_readlane_b32 s4, v62, 14 -; VI-NEXT: v_mov_b32_e32 v50, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 15 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 16 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 17 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 19 -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 20 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 21 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 22 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v45, s76 -; VI-NEXT: v_readlane_b32 s4, v62, 23 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 24 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 25 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 26 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 27 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 28 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 29 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 30 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; 
VI-NEXT: v_readlane_b32 s4, v62, 31 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 32 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 33 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 34 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 35 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 36 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 37 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_readlane_b32 s4, v62, 38 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 39 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 40 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 41 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 42 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 43 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 44 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 45 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 46 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 47 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 48 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 49 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 50 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 51 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 52 -; 
VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 53 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 54 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 55 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 56 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 57 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_mov_b32_e32 v36, s66 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v45, s78 -; VI-NEXT: v_mov_b32_e32 v55, s88 -; VI-NEXT: v_mov_b32_e32 v35, s30 -; VI-NEXT: v_mov_b32_e32 v41, s58 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s85 -; VI-NEXT: v_mov_b32_e32 v34, s38 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v52, s64 -; VI-NEXT: v_mov_b32_e32 v59, s87 -; VI-NEXT: v_mov_b32_e32 v41, s60 -; VI-NEXT: v_mov_b32_e32 v55, v50 -; VI-NEXT: v_mov_b32_e32 v58, s34 -; VI-NEXT: v_mov_b32_e32 v45, s36 -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v50, v46 -; VI-NEXT: v_mov_b32_e32 v46, v48 -; VI-NEXT: v_mov_b32_e32 v48, v47 -; VI-NEXT: v_mov_b32_e32 v47, v56 -; VI-NEXT: v_mov_b32_e32 v56, v51 -; VI-NEXT: v_mov_b32_e32 v51, s90 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v34, s48 -; VI-NEXT: v_mov_b32_e32 v1, s44 -; VI-NEXT: v_mov_b32_e32 v2, s45 -; VI-NEXT: v_mov_b32_e32 v3, s42 -; VI-NEXT: v_mov_b32_e32 v4, s43 -; VI-NEXT: v_mov_b32_e32 v5, s40 -; VI-NEXT: v_mov_b32_e32 v6, s41 -; VI-NEXT: v_mov_b32_e32 v7, s14 -; VI-NEXT: v_mov_b32_e32 v8, s15 -; VI-NEXT: v_mov_b32_e32 v9, s12 -; 
VI-NEXT: v_mov_b32_e32 v10, s13 -; VI-NEXT: v_mov_b32_e32 v11, s10 -; VI-NEXT: v_mov_b32_e32 v12, s11 -; VI-NEXT: v_mov_b32_e32 v13, s8 -; VI-NEXT: v_mov_b32_e32 v14, s9 -; VI-NEXT: v_mov_b32_e32 v15, s6 -; VI-NEXT: v_mov_b32_e32 v16, s7 -; VI-NEXT: v_mov_b32_e32 v17, s16 -; VI-NEXT: v_mov_b32_e32 v18, s17 -; VI-NEXT: v_mov_b32_e32 v19, s18 -; VI-NEXT: v_mov_b32_e32 v20, s19 -; VI-NEXT: v_mov_b32_e32 v21, s20 -; VI-NEXT: v_mov_b32_e32 v22, s21 -; VI-NEXT: v_mov_b32_e32 v23, s22 -; VI-NEXT: v_mov_b32_e32 v24, s23 -; VI-NEXT: v_mov_b32_e32 v25, s24 -; VI-NEXT: v_mov_b32_e32 v26, s25 -; VI-NEXT: v_mov_b32_e32 v27, s26 -; VI-NEXT: v_mov_b32_e32 v28, s27 -; VI-NEXT: v_mov_b32_e32 v29, s28 -; VI-NEXT: v_mov_b32_e32 v30, s29 -; VI-NEXT: v_mov_b32_e32 v32, s5 -; VI-NEXT: v_mov_b32_e32 v41, s62 -; VI-NEXT: v_mov_b32_e32 v51, v53 -; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: v_mov_b32_e32 v54, v40 -; VI-NEXT: v_mov_b32_e32 v40, s80 -; VI-NEXT: v_mov_b32_e32 v57, s81 -; VI-NEXT: v_mov_b32_e32 v37, s84 -; VI-NEXT: v_mov_b32_e32 v58, s50 -; VI-NEXT: v_mov_b32_e32 v60, s52 -; VI-NEXT: v_mov_b32_e32 v38, s51 -; VI-NEXT: v_mov_b32_e32 v61, s65 -; VI-NEXT: v_mov_b32_e32 v49, s66 -; VI-NEXT: v_mov_b32_e32 v45, s53 -; VI-NEXT: v_mov_b32_e32 v39, s55 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v28, s50 +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v28, s52 +; VI-NEXT: v_readlane_b32 s5, v61, 16 +; VI-NEXT: v_mov_b32_e32 v57, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 17 +; VI-NEXT: v_mov_b32_e32 v59, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 18 +; VI-NEXT: v_mov_b32_e32 v47, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 19 +; VI-NEXT: v_mov_b32_e32 v56, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 20 +; VI-NEXT: v_mov_b32_e32 v58, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 21 +; VI-NEXT: v_mov_b32_e32 v25, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 22 +; VI-NEXT: v_mov_b32_e32 v27, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 23 +; VI-NEXT: v_mov_b32_e32 v38, s79 +; VI-NEXT: v_mov_b32_e32 v39, s78 +; VI-NEXT: v_mov_b32_e32 v35, s75 +; VI-NEXT: v_mov_b32_e32 v36, s74 +; VI-NEXT: v_mov_b32_e32 v60, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 24 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v13, s31 +; VI-NEXT: v_readlane_b32 s74, v61, 14 +; VI-NEXT: v_readlane_b32 s78, v61, 12 +; VI-NEXT: v_readlane_b32 s30, v61, 10 +; VI-NEXT: v_mov_b32_e32 v24, s5 +; VI-NEXT: v_readlane_b32 s5, v61, 25 +; VI-NEXT: v_mov_b32_e32 v8, s36 +; VI-NEXT: v_mov_b32_e32 v7, s37 +; VI-NEXT: v_mov_b32_e32 v5, s38 +; VI-NEXT: v_mov_b32_e32 v4, s39 +; VI-NEXT: v_mov_b32_e32 v2, s48 +; VI-NEXT: v_mov_b32_e32 v1, s49 +; VI-NEXT: v_readlane_b32 s75, v61, 15 +; VI-NEXT: v_readlane_b32 s79, v61, 13 +; VI-NEXT: v_readlane_b32 s31, v61, 11 +; VI-NEXT: v_readlane_b32 s36, v61, 8 +; VI-NEXT: v_readlane_b32 s38, v61, 6 +; VI-NEXT: v_readlane_b32 s48, v61, 4 +; VI-NEXT: v_readlane_b32 s50, v61, 2 +; VI-NEXT: v_readlane_b32 s52, v61, 0 +; VI-NEXT: v_mov_b32_e32 v12, s13 +; VI-NEXT: v_mov_b32_e32 v15, s11 +; VI-NEXT: v_mov_b32_e32 v3, s87 +; VI-NEXT: v_mov_b32_e32 v6, s43 +; VI-NEXT: v_mov_b32_e32 v9, s42 +; VI-NEXT: v_mov_b32_e32 v33, s9 +; VI-NEXT: v_mov_b32_e32 v34, s7 +; VI-NEXT: v_mov_b32_e32 v18, s86 +; VI-NEXT: v_mov_b32_e32 v21, s85 +; VI-NEXT: v_mov_b32_e32 v32, s84 
+; VI-NEXT: v_mov_b32_e32 v37, s73 +; VI-NEXT: v_mov_b32_e32 v51, s72 +; VI-NEXT: v_mov_b32_e32 v52, s61 +; VI-NEXT: v_mov_b32_e32 v48, s60 +; VI-NEXT: v_mov_b32_e32 v49, s15 +; VI-NEXT: v_mov_b32_e32 v50, s14 +; VI-NEXT: v_mov_b32_e32 v40, s83 +; VI-NEXT: v_mov_b32_e32 v41, s82 +; VI-NEXT: v_mov_b32_e32 v53, s63 +; VI-NEXT: v_mov_b32_e32 v54, s57 +; VI-NEXT: v_mov_b32_e32 v55, s56 +; VI-NEXT: v_mov_b32_e32 v45, s59 +; VI-NEXT: v_mov_b32_e32 v46, s45 +; VI-NEXT: v_mov_b32_e32 v42, s41 +; VI-NEXT: v_mov_b32_e32 v43, s47 +; VI-NEXT: v_mov_b32_e32 v44, s46 +; VI-NEXT: v_mov_b32_e32 v26, s5 +; VI-NEXT: v_mov_b32_e32 v23, s76 +; VI-NEXT: v_mov_b32_e32 v22, s77 +; VI-NEXT: v_mov_b32_e32 v20, s88 +; VI-NEXT: v_mov_b32_e32 v19, s89 +; VI-NEXT: v_mov_b32_e32 v17, s90 +; VI-NEXT: v_mov_b32_e32 v16, s91 +; VI-NEXT: v_mov_b32_e32 v11, s34 +; VI-NEXT: v_mov_b32_e32 v10, s35 +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v28, s54 +; VI-NEXT: v_mov_b32_e32 v30, s70 +; VI-NEXT: v_mov_b32_e32 v31, s80 +; VI-NEXT: v_readlane_b32 s11, v61, 26 +; VI-NEXT: v_readlane_b32 s13, v61, 27 +; VI-NEXT: v_readlane_b32 s14, v61, 28 +; VI-NEXT: v_readlane_b32 s15, v61, 29 +; VI-NEXT: v_readlane_b32 s17, v61, 30 +; VI-NEXT: v_readlane_b32 s19, v61, 31 +; VI-NEXT: v_readlane_b32 s21, v61, 32 +; VI-NEXT: v_readlane_b32 s23, v61, 33 +; VI-NEXT: v_readlane_b32 s25, v61, 34 +; VI-NEXT: v_readlane_b32 s27, v61, 35 +; VI-NEXT: v_readlane_b32 s29, v61, 36 +; VI-NEXT: v_readlane_b32 s41, v61, 37 +; VI-NEXT: v_readlane_b32 s42, v61, 38 +; VI-NEXT: v_readlane_b32 s43, v61, 39 +; VI-NEXT: v_readlane_b32 s45, v61, 40 +; VI-NEXT: v_readlane_b32 s46, v61, 41 +; VI-NEXT: v_readlane_b32 s47, v61, 42 +; VI-NEXT: v_readlane_b32 s56, v61, 43 +; VI-NEXT: v_readlane_b32 s57, v61, 44 +; VI-NEXT: v_readlane_b32 s59, v61, 45 +; VI-NEXT: v_readlane_b32 s60, v61, 46 +; VI-NEXT: v_readlane_b32 s61, v61, 47 +; VI-NEXT: v_readlane_b32 s63, v61, 48 +; VI-NEXT: v_readlane_b32 s72, v61, 49 +; VI-NEXT: v_readlane_b32 s73, v61, 50 +; VI-NEXT: v_readlane_b32 s75, v61, 51 +; VI-NEXT: v_readlane_b32 s76, v61, 52 +; VI-NEXT: v_readlane_b32 s77, v61, 53 +; VI-NEXT: v_readlane_b32 s79, v61, 54 +; VI-NEXT: v_readlane_b32 s88, v61, 55 +; VI-NEXT: v_readlane_b32 s89, v61, 56 +; VI-NEXT: v_readlane_b32 s90, v61, 57 +; VI-NEXT: v_readlane_b32 s91, v61, 58 +; VI-NEXT: v_readlane_b32 s31, v61, 59 +; VI-NEXT: v_readlane_b32 s34, v61, 60 +; VI-NEXT: v_readlane_b32 s37, v61, 9 +; VI-NEXT: v_readlane_b32 vcc_lo, v61, 61 +; VI-NEXT: v_readlane_b32 vcc_hi, v61, 62 +; VI-NEXT: v_readlane_b32 s35, v61, 63 +; VI-NEXT: v_readlane_b32 s9, v62, 0 +; VI-NEXT: v_readlane_b32 s7, v62, 1 +; VI-NEXT: v_readlane_b32 s39, v61, 7 +; VI-NEXT: v_readlane_b32 s49, v61, 5 +; VI-NEXT: v_readlane_b32 s51, v61, 3 +; VI-NEXT: v_readlane_b32 s53, v61, 1 +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v28, s64 +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v28, s66 +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 
v28, s68 +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: .LBB91_5: ; %end -; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v33 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v17, v17, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s5, s16, 0xff +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s9, 0xff +; VI-NEXT: s_lshl_b32 s9, s52, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_mov_b32_e32 v28, s5 +; VI-NEXT: s_and_b32 s5, s6, 0xff +; VI-NEXT: s_lshl_b32 s6, s35, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, vcc_hi, 0xff +; VI-NEXT: s_lshl_b32 s7, vcc_lo, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s34, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s31, 0xff +; VI-NEXT: s_lshl_b32 s7, s50, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v28, vcc, 4, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s8, 0xff +; VI-NEXT: s_lshl_b32 s6, s91, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s90, 0xff +; VI-NEXT: s_lshl_b32 s7, s89, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 8, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s88, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s79, 0xff +; VI-NEXT: s_lshl_b32 s7, s48, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 12, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s10, 0xff +; VI-NEXT: s_lshl_b32 s6, s77, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s76, 0xff +; VI-NEXT: s_lshl_b32 s7, s75, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 16, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s22, 0xff +; VI-NEXT: s_lshl_b32 s6, s73, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s72, 0xff +; VI-NEXT: s_lshl_b32 s7, s38, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 20, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s12, 0xff +; VI-NEXT: s_lshl_b32 s6, s63, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s61, 0xff +; VI-NEXT: s_lshl_b32 s7, s60, 
8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 24, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s24, 0xff +; VI-NEXT: s_lshl_b32 s6, s59, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s57, 0xff +; VI-NEXT: s_lshl_b32 s7, s36, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 28, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s40, 0xff +; VI-NEXT: s_lshl_b32 s6, s56, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s47, 0xff +; VI-NEXT: s_lshl_b32 s7, s46, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 32, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s6, s45, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s43, 0xff +; VI-NEXT: s_lshl_b32 s7, s30, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 36, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s44, 0xff +; VI-NEXT: s_lshl_b32 s6, s42, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s41, 0xff +; VI-NEXT: s_lshl_b32 s7, s29, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 40, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s28, 0xff +; VI-NEXT: s_lshl_b32 s6, s27, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s25, 0xff +; VI-NEXT: s_lshl_b32 s7, s78, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 44, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s5, s58, 0xff +; VI-NEXT: s_lshl_b32 s6, s23, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s21, 0xff +; VI-NEXT: s_lshl_b32 s7, s19, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 48, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s15, 0xff +; VI-NEXT: s_lshl_b32 s6, s74, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_add_u32_e32 v28, vcc, 52, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v29, s4 +; VI-NEXT: s_and_b32 s4, s62, 0xff +; VI-NEXT: s_lshl_b32 s5, s14, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s13, 0xff +; VI-NEXT: s_lshl_b32 s6, s11, 8 +; VI-NEXT: s_or_b32 s5, 
s5, s6 +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_or_b32_sdwa v2, v2, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v31 +; VI-NEXT: v_add_u32_e32 v28, vcc, 56, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_or_b32_sdwa v25, v25, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v28, vcc, 60, v0 +; VI-NEXT: v_mov_b32_e32 v29, s4 +; VI-NEXT: v_or_b32_sdwa v2, v2, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v25, vcc, 64, v0 +; VI-NEXT: buffer_store_dword v29, v28, s[0:3], 0 offen +; VI-NEXT: buffer_store_dword v2, v25, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; VI-NEXT: v_or_b32_sdwa v2, v24, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v59 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v30 +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v58 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v47 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v46 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_readlane_b32 s30, v63, 30 ; VI-NEXT: v_readlane_b32 s31, v63, 31 ; VI-NEXT: v_readlane_b32 s87, v63, 29 @@ -172055,393 +173225,121 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_readlane_b32 s36, v63, 2 ; VI-NEXT: v_readlane_b32 s35, v63, 1 ; VI-NEXT: v_readlane_b32 s34, v63, 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v33 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v18, v18, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v33 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa 
v34, v33, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v33, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 4, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 8, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v17, v20, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 12, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v18, v21, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 16, v0 -; VI-NEXT: buffer_store_dword 
v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v17, v22, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 20, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v41 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v18, v23, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 24, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v17, v24, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 28, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v18, v25, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 32, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], 
s32 offset:288 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v17, v26, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 36, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v18, v27, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 40, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v17, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 44, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v18, v29, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 48, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; 
VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v17, v30, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 52, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v18, v31, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 56, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v17, v32, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 60, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v17, vcc, 64, v0 -; VI-NEXT: buffer_store_dword v1, v17, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], 
s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v49 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v61 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v42 -; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v45 -; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v60 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v59 -; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v44 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v42 +; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v43, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 ; VI-NEXT: buffer_store_dword v1, 
v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v41 +; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v35 -; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v55 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v53 +; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v57 -; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v52 +; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 ; VI-NEXT: buffer_store_dword v1, v2, 
s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v40 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v44 -; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 +; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v53 -; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v39 +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v35 +; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v34 +; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v48 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v43 -; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v32 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v18 +; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v21, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 -; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; VI-NEXT: v_or_b32_sdwa v2, v12, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 +; VI-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -227053,19 +227951,19 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; VI-NEXT: v_mov_b32_e32 v31, v17 ; VI-NEXT: v_mov_b32_e32 v30, v16 -; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v32, v15 +; VI-NEXT: v_mov_b32_e32 v33, v13 +; VI-NEXT: v_mov_b32_e32 v34, v11 +; VI-NEXT: v_mov_b32_e32 v35, v9 +; VI-NEXT: v_mov_b32_e32 v36, v7 +; VI-NEXT: v_mov_b32_e32 v37, v5 +; VI-NEXT: v_mov_b32_e32 v38, v3 ; VI-NEXT: 
v_mov_b32_e32 v28, v14 -; VI-NEXT: v_mov_b32_e32 v27, v13 ; VI-NEXT: v_mov_b32_e32 v26, v12 -; VI-NEXT: v_mov_b32_e32 v25, v11 ; VI-NEXT: v_mov_b32_e32 v24, v10 -; VI-NEXT: v_mov_b32_e32 v23, v9 ; VI-NEXT: v_mov_b32_e32 v22, v8 -; VI-NEXT: v_mov_b32_e32 v21, v7 ; VI-NEXT: v_mov_b32_e32 v20, v6 -; VI-NEXT: v_mov_b32_e32 v19, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 -; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v48, v4 ; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_readfirstlane_b32 s30, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec @@ -227074,583 +227972,595 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB101_4 ; VI-NEXT: .LBB101_2: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: s_lshl_b32 s4, s30, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: v_mov_b32_e32 v17, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: s_lshl_b32 s4, s28, 16 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v17 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_lshl_b32 s4, s30, 16 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_lshl_b32 s4, s31, 16 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_f32_e32 v2, s4, v17 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_alignbit_b32 v14, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_alignbit_b32 v15, v4, v3, 16 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s29, 
0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 ; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; 
VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[2:3] +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: s_lshl_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 
v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v1, s4, v17 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v17 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v17 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v18, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v18, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v18, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v2 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v33, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v18, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v17 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v49, v5, v7, vcc +; VI-NEXT: 
v_add_f32_e32 v5, s4, v17 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; VI-NEXT: v_add_f32_e32 v7, s4, v17 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; VI-NEXT: v_add_f32_e32 v5, s4, v17 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; VI-NEXT: v_mov_b32_e32 v1, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; VI-NEXT: v_add_f32_e32 v7, s4, v17 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: v_mov_b32_e32 v3, v49 +; VI-NEXT: v_cndmask_b32_e32 v49, v9, v11, vcc +; VI-NEXT: v_add_f32_e32 v9, s4, v17 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: s_lshl_b32 s4, s25, 16 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; VI-NEXT: v_add_f32_e32 v11, s4, v17 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; VI-NEXT: v_add_f32_e32 v9, s4, v17 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; VI-NEXT: v_mov_b32_e32 v5, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: s_lshl_b32 s4, s27, 16 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; VI-NEXT: v_add_f32_e32 v11, s4, v17 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_mov_b32_e32 v7, v49 +; VI-NEXT: v_cndmask_b32_e32 v49, v13, v15, vcc +; VI-NEXT: v_add_f32_e32 v13, s4, v17 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; VI-NEXT: v_mov_b32_e32 v9, v18 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; 
VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc +; VI-NEXT: v_add_f32_e32 v15, s4, v17 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v33, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v18, v33, vcc -; VI-NEXT: v_add_f32_e32 v18, s4, v0 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v18, v1, 16 -; VI-NEXT: v_add_f32_e32 v18, s4, v0 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; VI-NEXT: v_add_f32_e32 v13, s4, v17 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_lshl_b32 s4, s31, 16 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc +; VI-NEXT: v_add_f32_e32 v15, s4, v17 +; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_mov_b32_e32 v11, v49 +; VI-NEXT: v_cndmask_b32_e32 v49, v17, v19, vcc +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v16 ; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; VI-NEXT: v_mov_b32_e32 v13, v18 +; VI-NEXT: v_bfe_u32 v18, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v16 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: 
v_cndmask_b32_e32 v16, v33, v34, vcc -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v17 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v17 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v34, v35, vcc -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 -; VI-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; VI-NEXT: v_bfe_u32 v35, v34, 16, 1 -; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v34 -; VI-NEXT: v_add_u32_e32 v35, vcc, 0x7fff, v35 -; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; VI-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc -; VI-NEXT: v_bfe_u32 v35, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v32 -; VI-NEXT: v_add_u32_e32 v35, vcc, 0x7fff, v35 -; VI-NEXT: v_or_b32_e32 v36, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_cndmask_b32_e32 v32, v35, v36, vcc -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v19 -; VI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; VI-NEXT: v_bfe_u32 v36, v35, 16, 1 -; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v35 -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36 -; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; VI-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v17 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v18 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v38 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v19, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v18 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 +; VI-NEXT: v_or_b32_e32 v21, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v19, v21, vcc +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v38 ; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc -; VI-NEXT: v_bfe_u32 v36, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v19 -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36 -; VI-NEXT: v_or_b32_e32 v37, 0x400000, v19 +; VI-NEXT: v_bfe_u32 v21, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v19 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 +; VI-NEXT: v_or_b32_e32 v23, 0x400000, v19 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 -; VI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; VI-NEXT: v_bfe_u32 v37, v36, 16, 1 -; VI-NEXT: v_add_u32_e32 v37, vcc, v37, v36 -; VI-NEXT: v_add_u32_e32 v37, vcc, 0x7fff, v37 +; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v48 +; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; VI-NEXT: v_cndmask_b32_e32 v38, v21, v23, vcc +; VI-NEXT: v_bfe_u32 
v21, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v19 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v48 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 +; VI-NEXT: v_or_b32_e32 v23, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_cndmask_b32_e32 v19, v21, v23, vcc +; VI-NEXT: v_bfe_u32 v21, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v18 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 +; VI-NEXT: v_or_b32_e32 v23, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v21, v23, vcc +; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v37 +; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; VI-NEXT: v_bfe_u32 v23, v21, 16, 1 +; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v21 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 +; VI-NEXT: v_or_b32_e32 v25, 0x400000, v21 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; VI-NEXT: v_cndmask_b32_e32 v21, v23, v25, vcc +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v37 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; VI-NEXT: v_bfe_u32 v25, v23, 16, 1 +; VI-NEXT: v_mov_b32_e32 v15, v49 +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v23 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v21 +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v20 ; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; VI-NEXT: v_or_b32_e32 v38, 0x400000, v36 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v36, v36 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 ; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; VI-NEXT: v_cndmask_b32_e32 v36, v37, v38, vcc -; VI-NEXT: v_bfe_u32 v37, v20, 16, 1 -; VI-NEXT: v_add_u32_e32 v37, vcc, v37, v20 -; VI-NEXT: v_add_u32_e32 v37, vcc, 0x7fff, v37 -; VI-NEXT: v_or_b32_e32 v38, 0x400000, v20 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_bfe_u32 v23, v20, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v25, v27, vcc +; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v20 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 +; VI-NEXT: v_or_b32_e32 v25, 0x400000, v20 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; VI-NEXT: v_cndmask_b32_e32 v20, v37, v38, vcc -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v21 -; VI-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; VI-NEXT: v_bfe_u32 v38, v37, 16, 1 -; VI-NEXT: v_add_u32_e32 v38, vcc, v38, v37 -; VI-NEXT: v_add_u32_e32 v38, vcc, 0x7fff, v38 -; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; VI-NEXT: v_or_b32_e32 v39, 0x400000, v37 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v37, v37 -; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; VI-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc -; VI-NEXT: v_bfe_u32 v38, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v38, vcc, v38, v21 -; VI-NEXT: v_add_u32_e32 v38, vcc, 0x7fff, v38 -; VI-NEXT: v_or_b32_e32 v39, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: v_cndmask_b32_e32 v21, v38, v39, vcc -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v22 -; VI-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; VI-NEXT: v_bfe_u32 v39, v38, 16, 1 -; VI-NEXT: v_add_u32_e32 v39, vcc, v39, v38 -; VI-NEXT: v_add_u32_e32 v39, vcc, 0x7fff, v39 +; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v21 +; VI-NEXT: v_bfe_u32 v21, v20, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v23, v23, v25, vcc +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v20 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 +; VI-NEXT: v_or_b32_e32 v25, 0x400000, v20 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; VI-NEXT: v_cndmask_b32_e32 v20, v21, v25, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v23 +; VI-NEXT: 
v_and_b32_e32 v23, 0xffff0000, v36 +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; VI-NEXT: v_bfe_u32 v25, v23, 16, 1 +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v23 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_cndmask_b32_e32 v23, v25, v27, vcc +; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v36 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; VI-NEXT: v_bfe_u32 v27, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v25 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v22 ; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; VI-NEXT: v_or_b32_e32 v48, 0x400000, v38 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v38, v38 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 ; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; VI-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc -; VI-NEXT: v_bfe_u32 v39, v22, 16, 1 -; VI-NEXT: v_add_u32_e32 v39, vcc, v39, v22 -; VI-NEXT: v_add_u32_e32 v39, vcc, 0x7fff, v39 -; VI-NEXT: v_or_b32_e32 v48, 0x400000, v22 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_bfe_u32 v25, v22, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v27, v29, vcc +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v22 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; VI-NEXT: v_cndmask_b32_e32 v22, v39, v48, vcc -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 -; VI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; VI-NEXT: v_bfe_u32 v48, v39, 16, 1 -; VI-NEXT: v_add_u32_e32 v48, vcc, v48, v39 -; VI-NEXT: v_add_u32_e32 v48, vcc, 0x7fff, v48 -; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; VI-NEXT: v_or_b32_e32 v49, 0x400000, v39 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v39, v39 -; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc -; VI-NEXT: v_bfe_u32 v48, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v48, vcc, v48, v23 -; VI-NEXT: v_add_u32_e32 v48, vcc, 0x7fff, v48 -; VI-NEXT: v_or_b32_e32 v49, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_cndmask_b32_e32 v23, v48, v49, vcc -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v24 -; VI-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 -; VI-NEXT: v_bfe_u32 v49, v48, 16, 1 -; VI-NEXT: v_add_u32_e32 v49, vcc, v49, v48 -; VI-NEXT: v_add_u32_e32 v49, vcc, 0x7fff, v49 +; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v23 +; VI-NEXT: v_bfe_u32 v23, v22, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v25, v25, v27, vcc +; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v22 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; VI-NEXT: v_cndmask_b32_e32 v22, v23, v27, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v25 +; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v35 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; VI-NEXT: v_bfe_u32 v27, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v25 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_cndmask_b32_e32 v25, v27, v29, vcc +; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v35 +; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; VI-NEXT: v_bfe_u32 v29, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v27 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v24 ; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; VI-NEXT: v_or_b32_e32 v50, 0x400000, v48 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v48, v48 +; 
VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 ; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; VI-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc -; VI-NEXT: v_bfe_u32 v49, v24, 16, 1 -; VI-NEXT: v_add_u32_e32 v49, vcc, v49, v24 -; VI-NEXT: v_add_u32_e32 v49, vcc, 0x7fff, v49 -; VI-NEXT: v_or_b32_e32 v50, 0x400000, v24 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_bfe_u32 v27, v24, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v29, v35, vcc +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v24 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v24 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_cndmask_b32_e32 v24, v49, v50, vcc -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v25 -; VI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 -; VI-NEXT: v_bfe_u32 v50, v49, 16, 1 -; VI-NEXT: v_add_u32_e32 v50, vcc, v50, v49 -; VI-NEXT: v_add_u32_e32 v50, vcc, 0x7fff, v50 -; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; VI-NEXT: v_or_b32_e32 v51, 0x400000, v49 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v49, v49 -; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_cndmask_b32_e32 v49, v50, v51, vcc -; VI-NEXT: v_bfe_u32 v50, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v50, vcc, v50, v25 -; VI-NEXT: v_add_u32_e32 v50, vcc, 0x7fff, v50 -; VI-NEXT: v_or_b32_e32 v51, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_cndmask_b32_e32 v25, v50, v51, vcc -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v26 -; VI-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 -; VI-NEXT: v_bfe_u32 v51, v50, 16, 1 -; VI-NEXT: v_add_u32_e32 v51, vcc, v51, v50 -; VI-NEXT: v_add_u32_e32 v51, vcc, 0x7fff, v51 +; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v25 +; VI-NEXT: v_bfe_u32 v25, v24, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v27, v27, v29, vcc +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v24 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v24 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; VI-NEXT: v_cndmask_b32_e32 v24, v25, v29, vcc +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v27 +; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34 +; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; VI-NEXT: v_bfe_u32 v29, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v27 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_cndmask_b32_e32 v27, v29, v35, vcc +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v34 +; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; VI-NEXT: v_bfe_u32 v34, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v29 +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 +; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v26 ; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; VI-NEXT: v_or_b32_e32 v52, 0x400000, v50 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v50, v50 ; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; VI-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc -; VI-NEXT: v_bfe_u32 v51, v26, 16, 1 -; VI-NEXT: v_add_u32_e32 v51, vcc, v51, v26 -; VI-NEXT: v_add_u32_e32 v51, vcc, 0x7fff, v51 +; VI-NEXT: v_bfe_u32 v29, v26, 16, 1 +; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v26 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 ; VI-NEXT: v_or_b32_e32 v52, 0x400000, v26 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: v_cndmask_b32_e32 v26, v51, v52, vcc -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v27 -; VI-NEXT: v_add_f32_e32 v51, 
0x40c00000, v51 -; VI-NEXT: v_bfe_u32 v52, v51, 16, 1 -; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v51 +; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v27 +; VI-NEXT: v_bfe_u32 v27, v26, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v29, v29, v52, vcc +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v26 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v52, 0x400000, v26 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: v_cndmask_b32_e32 v26, v27, v52, vcc +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v29 +; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v33 +; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; VI-NEXT: v_bfe_u32 v52, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v29 ; VI-NEXT: v_add_u32_e32 v52, vcc, 0x7fff, v52 -; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; VI-NEXT: v_or_b32_e32 v53, 0x400000, v51 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v51, v51 -; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_cndmask_b32_e32 v51, v52, v53, vcc -; VI-NEXT: v_bfe_u32 v52, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v27 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_or_b32_e32 v53, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_cndmask_b32_e32 v29, v52, v53, vcc +; VI-NEXT: v_bfe_u32 v52, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v33 ; VI-NEXT: v_add_u32_e32 v52, vcc, 0x7fff, v52 -; VI-NEXT: v_or_b32_e32 v53, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: v_cndmask_b32_e32 v27, v52, v53, vcc -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v28 -; VI-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 -; VI-NEXT: v_bfe_u32 v53, v52, 16, 1 -; VI-NEXT: v_add_u32_e32 v53, vcc, v53, v52 -; VI-NEXT: v_add_u32_e32 v53, vcc, 0x7fff, v53 +; VI-NEXT: v_or_b32_e32 v53, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_cndmask_b32_e32 v52, v52, v53, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v28 ; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; VI-NEXT: v_or_b32_e32 v54, 0x400000, v52 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v52, v52 ; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; VI-NEXT: v_cndmask_b32_e32 v52, v53, v54, vcc -; VI-NEXT: v_bfe_u32 v53, v28, 16, 1 -; VI-NEXT: v_add_u32_e32 v53, vcc, v53, v28 -; VI-NEXT: v_add_u32_e32 v53, vcc, 0x7fff, v53 +; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v54, 0x400000, v28 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; VI-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v29 -; VI-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 -; VI-NEXT: v_bfe_u32 v54, v53, 16, 1 -; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v53 +; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v29 +; VI-NEXT: v_bfe_u32 v29, v28, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v33, v33, v54, vcc +; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v28 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 +; VI-NEXT: v_or_b32_e32 v54, 0x400000, v28 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_cndmask_b32_e32 v28, v29, v54, vcc +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v33 +; VI-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v54, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v33 ; VI-NEXT: v_add_u32_e32 v54, vcc, 0x7fff, v54 -; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; VI-NEXT: v_or_b32_e32 v55, 0x400000, v53 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 -; VI-NEXT: v_add_f32_e32 
v29, 0x40c00000, v29 -; VI-NEXT: v_cndmask_b32_e32 v53, v54, v55, vcc -; VI-NEXT: v_bfe_u32 v54, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v29 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_or_b32_e32 v55, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_cndmask_b32_e32 v33, v54, v55, vcc +; VI-NEXT: v_bfe_u32 v54, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v32 ; VI-NEXT: v_add_u32_e32 v54, vcc, 0x7fff, v54 -; VI-NEXT: v_or_b32_e32 v55, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: v_cndmask_b32_e32 v29, v54, v55, vcc +; VI-NEXT: v_or_b32_e32 v55, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_cndmask_b32_e32 v32, v54, v55, vcc ; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v30 -; VI-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 -; VI-NEXT: v_bfe_u32 v55, v54, 16, 1 -; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v54 -; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55 ; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; VI-NEXT: v_or_b32_e32 v40, 0x400000, v54 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 ; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc ; VI-NEXT: v_bfe_u32 v55, v30, 16, 1 ; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v30 ; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55 ; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 ; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v31 -; VI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 -; VI-NEXT: v_bfe_u32 v40, v55, 16, 1 -; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v55 -; VI-NEXT: v_add_u32_e32 v40, vcc, 0x7fff, v40 -; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; VI-NEXT: v_or_b32_e32 v41, 0x400000, v55 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v55, v55 -; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; VI-NEXT: v_cndmask_b32_e32 v55, v40, v41, vcc -; VI-NEXT: v_bfe_u32 v40, v31, 16, 1 -; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v31 +; VI-NEXT: v_bfe_u32 v55, v54, 16, 1 +; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v54 +; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55 +; VI-NEXT: v_or_b32_e32 v40, 0x400000, v54 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 +; VI-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v31 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_bfe_u32 v40, v30, 16, 1 +; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v30 ; VI-NEXT: v_add_u32_e32 v40, vcc, 0x7fff, v40 -; VI-NEXT: v_or_b32_e32 v41, 0x400000, v31 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; VI-NEXT: v_cndmask_b32_e32 v31, v40, v41, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_or_b32_e32 v41, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v31 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_bfe_u32 v31, v30, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[48:49] +; VI-NEXT: v_cndmask_b32_e32 v40, v40, 
v41, vcc +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v30 +; VI-NEXT: v_lshrrev_b64 v[36:37], 16, v[36:37] +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x7fff, v31 +; VI-NEXT: v_mov_b32_e32 v37, v48 +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[50:51] +; VI-NEXT: v_or_b32_e32 v41, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[34:35] +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_cndmask_b32_e32 v30, v31, v41, vcc +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v40 +; VI-NEXT: v_mov_b32_e32 v35, v48 +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[52:53] ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v31, v31, v55, 16 -; VI-NEXT: v_alignbit_b32 v30, v30, v54, 16 -; VI-NEXT: v_alignbit_b32 v29, v29, v53, 16 -; VI-NEXT: v_alignbit_b32 v28, v28, v52, 16 -; VI-NEXT: v_alignbit_b32 v27, v27, v51, 16 -; VI-NEXT: v_alignbit_b32 v26, v26, v50, 16 -; VI-NEXT: v_alignbit_b32 v25, v25, v49, 16 -; VI-NEXT: v_alignbit_b32 v24, v24, v48, 16 -; VI-NEXT: v_alignbit_b32 v23, v23, v39, 16 -; VI-NEXT: v_alignbit_b32 v22, v22, v38, 16 -; VI-NEXT: v_alignbit_b32 v21, v21, v37, 16 -; VI-NEXT: v_alignbit_b32 v20, v20, v36, 16 -; VI-NEXT: v_alignbit_b32 v19, v19, v35, 16 -; VI-NEXT: v_alignbit_b32 v32, v32, v34, 16 -; VI-NEXT: v_alignbit_b32 v17, v17, v33, 16 -; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[50:51], 16, v[30:31] +; VI-NEXT: v_lshrrev_b64 v[38:39], 16, v[38:39] +; VI-NEXT: v_mov_b32_e32 v33, v48 +; VI-NEXT: v_lshrrev_b64 v[30:31], 16, v[54:55] +; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[28:29] +; VI-NEXT: v_lshrrev_b64 v[26:27], 16, v[26:27] +; VI-NEXT: v_lshrrev_b64 v[24:25], 16, v[24:25] +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23] +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21] +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[18:19] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[16:17] +; VI-NEXT: v_mov_b32_e32 v31, v50 ; VI-NEXT: s_branch .LBB101_5 ; VI-NEXT: .LBB101_3: ; VI-NEXT: s_branch .LBB101_2 @@ -227675,7 +228585,14 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; VI-NEXT: v_readlane_b32 s30, v42, 0 -; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: v_mov_b32_e32 v17, v38 +; VI-NEXT: v_mov_b32_e32 v18, v48 +; VI-NEXT: v_mov_b32_e32 v19, v37 +; VI-NEXT: v_mov_b32_e32 v21, v36 +; VI-NEXT: v_mov_b32_e32 v23, v35 +; VI-NEXT: v_mov_b32_e32 v25, v34 +; VI-NEXT: v_mov_b32_e32 v27, v33 +; VI-NEXT: v_mov_b32_e32 v29, v32 ; VI-NEXT: v_readlane_b32 s31, v42, 1 ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -231291,1105 +232208,1194 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v37, v0 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword 
v34, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 -; SI-NEXT: v_mov_b32_e32 v46, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v2 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v27 -; SI-NEXT: 
v_cvt_f16_f32_e32 v27, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v29, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s27 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v29, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 -; SI-NEXT: s_waitcnt vmcnt(10) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v54, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v55, v50 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v51 -; SI-NEXT: 
v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v39, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v48, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v49, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v50, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v51, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v52, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v31 +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v31, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v52, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s29 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, 
s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB103_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_mov_b32_e32 v23, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: 
v_lshlrev_b32_e32 v3, 16, v36 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v18 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v18, v20 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 
4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v31 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v27 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v51 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v26 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v25 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v46 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_mov_b32_e32 v51, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_mov_b32_e32 v36, v22 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v44 -; SI-NEXT: v_mov_b32_e32 v50, v26 -; SI-NEXT: v_mov_b32_e32 v33, v28 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 -; SI-NEXT: v_mov_b32_e32 v38, v7 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v60 -; SI-NEXT: 
v_lshlrev_b32_e32 v48, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v62 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v38 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v39 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v30 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v52 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 -; 
SI-NEXT: v_lshlrev_b32_e32 v12, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v58 -; SI-NEXT: v_mov_b32_e32 v58, v5 -; SI-NEXT: v_mov_b32_e32 v59, v11 -; SI-NEXT: v_mov_b32_e32 v60, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 -; SI-NEXT: v_mov_b32_e32 v5, v23 -; SI-NEXT: v_mov_b32_e32 v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v53 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v35 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v37 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v21 +; SI-NEXT: v_mov_b32_e32 v38, v54 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v59 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v62 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_mov_b32_e32 v11, v32 +; SI-NEXT: v_mov_b32_e32 v27, v26 +; SI-NEXT: v_mov_b32_e32 v25, v28 +; SI-NEXT: v_mov_b32_e32 v30, v48 +; SI-NEXT: v_mov_b32_e32 v32, v50 +; SI-NEXT: v_mov_b32_e32 v39, v49 +; SI-NEXT: v_mov_b32_e32 v48, v55 +; SI-NEXT: v_mov_b32_e32 v49, v40 +; SI-NEXT: v_mov_b32_e32 v50, v20 ; SI-NEXT: s_branch .LBB103_3 ; SI-NEXT: .LBB103_2: -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; 
implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: v_mov_b32_e32 v33, v28 -; SI-NEXT: v_mov_b32_e32 v50, v26 -; SI-NEXT: v_mov_b32_e32 v36, v22 -; SI-NEXT: v_mov_b32_e32 v51, v21 +; SI-NEXT: v_mov_b32_e32 v11, v32 +; SI-NEXT: v_mov_b32_e32 v32, v50 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed 
$vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v38, v54 +; SI-NEXT: v_mov_b32_e32 v18, v20 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: v_mov_b32_e32 v5, v6 +; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v27, v26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v25, v28 +; SI-NEXT: v_mov_b32_e32 v30, v48 +; SI-NEXT: v_mov_b32_e32 v39, v49 +; SI-NEXT: v_mov_b32_e32 v48, v55 +; SI-NEXT: v_mov_b32_e32 v49, v40 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: 
; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: .LBB103_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v19, v20 -; SI-NEXT: v_mov_b32_e32 v6, v27 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v61, v2 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v51, v2 ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB103_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v62 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v31, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v46 +; SI-NEXT: v_mov_b32_e32 v28, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v24 +; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v47 +; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v43 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v42 +; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v33 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v50 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v52 +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v30 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v21 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: 
v_cvt_f32_f16_e32 v37, v23 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v40, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v60 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 +; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v59 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v51 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v10 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v7 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v32 -; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v33 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v46 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v15 -; SI-NEXT: 
v_cvt_f32_f16_e32 v15, v57 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v56 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v44 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v34, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v31 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v36, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v50, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 
4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v51, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v40, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 ; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v43, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v46, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v47, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v56, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v57 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v53, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v53 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v61, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v61, 0x38000000, v61 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v52, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 +; SI-NEXT: 
s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v63 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v41, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v45, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v57, v13 +; 
SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v57 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v23 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v25 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v23 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v49 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v23 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v63 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v57 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v31 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v61 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v59, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v59, 0x38000000, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v59 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v60, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v20 ; SI-NEXT: v_add_f32_e32 v60, 0x38000000, v60 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v61, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v61, 0x38000000, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v62, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v19, v60 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v11, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v41 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v14 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: 
v_cvt_f16_f32_e32 v11, v12 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v16 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v15 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v19 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v32 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v57 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v36 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; 
SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v46 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v43 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v41 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v38 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v36 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill 
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v32 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v43 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v46 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v21 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v58 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v16 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, 
v10 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v4, v15 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v24 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v14 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v48 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v39 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v54 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_mov_b32_e32 v17, v11 -; SI-NEXT: v_mov_b32_e32 v16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v3 -; SI-NEXT: v_mov_b32_e32 v3, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v62 +; SI-NEXT: v_mov_b32_e32 v36, v12 +; SI-NEXT: v_mov_b32_e32 v35, v19 +; SI-NEXT: v_mov_b32_e32 v8, v14 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v4 -; SI-NEXT: v_mov_b32_e32 v4, v22 -; SI-NEXT: v_mov_b32_e32 v22, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_mov_b32_e32 v4, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: .LBB103_5: ; %end -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded 
Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: buffer_store_dword v1, v37, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 
v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v37 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v37 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v37 +; 
SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v37 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v17 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v37 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v40 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v13 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; 
SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v37 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v37 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v37 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v60 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v37 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 
-; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v37 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v37 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v24 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x64, v37 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v48 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v37 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v49 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v37 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v29 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x70, v37 -; SI-NEXT: buffer_store_dword v0, v1, 
s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x74, v37 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v37 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x7c, v37 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload @@ -236154,1081 +237160,1237 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 
offset:60 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v13 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v17 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v21 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v25 +; SI-NEXT: v_mov_b32_e32 v50, v27 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 
v63, 1.0, v18 ; SI-NEXT: v_mul_f32_e32 v13, 1.0, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v30 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v30, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v28, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v21, 1.0, s28 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v30 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s29 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v32 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v33 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v34 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v35 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v36 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v39 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v48 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v51 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v52 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v54 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v36 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v38 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v53 ; SI-NEXT: v_mul_f32_e32 v51, 1.0, v55 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v41 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v43 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v44 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v43 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v45 +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v47 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v46 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v57 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v57 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v59 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v58 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v26, 1.0, s23 
-; SI-NEXT: v_mul_f32_e64 v52, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v32, 1.0, s29 -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v60 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v53, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s21 +; SI-NEXT: 
v_mul_f32_e64 v60, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v49, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB105_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: buffer_store_dword v2, off, 
s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v54 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v50 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v45 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v15 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v11 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v38 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v30 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v60 +; SI-NEXT: v_mov_b32_e32 v28, v3 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v3 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v52 +; SI-NEXT: v_mov_b32_e32 v50, v48 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v3 +; SI-NEXT: v_mov_b32_e32 v38, v13 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v23 +; SI-NEXT: v_mov_b32_e32 v41, v44 +; SI-NEXT: v_mov_b32_e32 v52, v63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v48 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v49 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v7, v40 +; SI-NEXT: v_mov_b32_e32 v48, v27 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v35 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v53 +; SI-NEXT: v_mov_b32_e32 v53, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v39 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v48 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_mov_b32_e32 v25, v1 +; 
SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v46 +; SI-NEXT: v_mov_b32_e32 v46, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v51 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v57, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v34 -; SI-NEXT: v_mov_b32_e32 v40, v3 -; SI-NEXT: v_mov_b32_e32 v54, v50 -; SI-NEXT: v_mov_b32_e32 v46, v19 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 -; SI-NEXT: v_mov_b32_e32 v44, v15 -; SI-NEXT: v_mov_b32_e32 v9, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v59 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 -; SI-NEXT: v_mov_b32_e32 v41, v27 -; SI-NEXT: v_mov_b32_e32 v52, v62 -; SI-NEXT: v_mov_b32_e32 v21, v58 -; SI-NEXT: v_mov_b32_e32 v58, v20 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v56 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v55 -; SI-NEXT: v_mov_b32_e32 v55, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v42 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: 
v_mov_b32_e32 v53, v5 -; SI-NEXT: v_mov_b32_e32 v42, v43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(7) expcnt(1) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(6) expcnt(1) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v50 -; SI-NEXT: v_mov_b32_e32 v5, v19 -; SI-NEXT: v_mov_b32_e32 v7, v15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 
v1, 16, v61 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47 -; SI-NEXT: v_mov_b32_e32 v47, v3 -; SI-NEXT: v_mov_b32_e32 v3, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_mov_b32_e32 v1, v13 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_mov_b32_e32 v39, v35 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v44 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v55 +; SI-NEXT: v_mov_b32_e32 v5, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload 
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v45 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v35 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v42 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v60 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_mov_b32_e32 v9, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 ; SI-NEXT: s_branch .LBB105_3 ; SI-NEXT: .LBB105_2: -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v25, v1 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v21, v58 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v52, v62 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: v_mov_b32_e32 v41, v44 +; SI-NEXT: v_mov_b32_e32 v7, v40 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v50, v48 +; SI-NEXT: v_mov_b32_e32 v48, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; 
implicit-def: $vgpr26
-; SI-NEXT: ; implicit-def: $vgpr59
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr24
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr34
+; SI-NEXT: ; implicit-def: $vgpr62
 ; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
 ; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: ; implicit-def: $vgpr20
-; SI-NEXT: ; implicit-def: $vgpr55
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr33
-; SI-NEXT: ; implicit-def: $vgpr16
-; SI-NEXT: ; implicit-def: $vgpr45
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
 ; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; kill: killed $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr48
-; SI-NEXT: ; implicit-def: $vgpr22
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr47
 ; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr18
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr57
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr40
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr54
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr49
+; SI-NEXT: ; implicit-def: $vgpr2
 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr14
 ; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr27
+; SI-NEXT: ; implicit-def: $vgpr2
 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: ; implicit-def: $vgpr12
 ; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT: ; implicit-def: $vgpr44
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v39, v35
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v46, v9
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v53, v58
+; SI-NEXT: v_mov_b32_e32 v28, v3
+; SI-NEXT: v_mov_b32_e32 v38, v13
+; SI-NEXT: s_mov_b64 s[4:5], -1
+; SI-NEXT: v_mov_b32_e32 v52, v63
+; SI-NEXT: v_mov_b32_e32 v5, v23
+; SI-NEXT: ; implicit-def: $vgpr33
+; SI-NEXT: ; implicit-def: $vgpr30
+; SI-NEXT: ; implicit-def: $vgpr51
+; SI-NEXT: ; implicit-def: $vgpr26
+; SI-NEXT: ; implicit-def: $vgpr24
+; SI-NEXT: ; implicit-def: $vgpr22
+; SI-NEXT: ; implicit-def: $vgpr20
+; SI-NEXT: ; implicit-def: $vgpr18
+; SI-NEXT: ; implicit-def: $vgpr16
+; SI-NEXT: ; implicit-def: $vgpr14
+; SI-NEXT: ; implicit-def: $vgpr12
 ; SI-NEXT: ; implicit-def: $vgpr10
-; SI-NEXT: ; implicit-def: $vgpr49
 ; SI-NEXT: ; implicit-def: $vgpr8
-; SI-NEXT: ; implicit-def: $vgpr37
 ; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr31
 ; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v54, v50
-; SI-NEXT: v_mov_b32_e32 v9, v11
-; SI-NEXT: v_mov_b32_e32 v53, v5
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v56, v47
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v40, v3
-; SI-NEXT: v_mov_b32_e32 v44, v15
-; SI-NEXT: v_mov_b32_e32 v57, v13
-; SI-NEXT: v_mov_b32_e32 v46, v19
-; SI-NEXT: v_mov_b32_e32 v41, v27
-; SI-NEXT: s_mov_b64 s[4:5], -1
-; SI-NEXT: v_mov_b32_e32 v42, v43
-; SI-NEXT: v_mov_b32_e32 v3, v17
 ; SI-NEXT: ; implicit-def: $vgpr2
 ; SI-NEXT: .LBB105_3: ; %Flow
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v13, v37
 ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT: v_mov_b32_e32 v19, v48
+; SI-NEXT: v_mov_b32_e32 v63, v7
+; SI-NEXT: v_mov_b32_e32 v58, v53
+; SI-NEXT: v_mov_b32_e32 v37, v27
+; SI-NEXT: v_mov_b32_e32 v48, v49
 ; SI-NEXT: s_cbranch_vccnz .LBB105_5
 ; SI-NEXT: ; %bb.4: ; %cmp.true
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v59
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_mov_b32_e32 v7, v28
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v41
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_mov_b32_e32 v17, v38
+; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v30
-; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v31
-; SI-NEXT: v_mov_b32_e32 v38, v9
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v38
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v31
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v45
+; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55
+; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v15
+; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13
+; SI-NEXT: v_mov_b32_e32 v11, v50
+; SI-NEXT: v_add_f32_e32 v50, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v32
 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_alignbit_b32 v2, v4, v2, 16
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v4
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v25
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v29
-; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v30
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
-; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v28
-; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_alignbit_b32 v2, v4, v2, 16
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v34
-; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v29
-; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v30
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21
+; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v56
+; SI-NEXT: v_add_f32_e32 v53, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_add_f32_e32 v40, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v35
+; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v3
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v59, 0x40c00000, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_add_f32_e32 v56, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v58
+; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v46
+; SI-NEXT: v_add_f32_e32 v46, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v39
+; SI-NEXT: v_add_f32_e32 v61, 0x40c00000, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v3
 ; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_alignbit_b32 v2, v4, v2, 16
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v32
-; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v4
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v27
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_alignbit_b32 v2, v4, v2, 16
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v40
-; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v4
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v26
-; SI-NEXT: v_alignbit_b32 v2, v4, v2, 16
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54
-; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_add_f32_e32 v34, 0x40c00000, v2
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
-; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v15
-; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v2
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v31
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v44
-; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_add_f32_e32 v52, 0x40c00000, v2
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v16
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v57
-; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v2
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v17
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v46
-; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v2
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v13
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v63
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11
+; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v2
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v41
-; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v2
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v11
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52
-; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7
-; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v2
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v16
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v2
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+ ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v2
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v12
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v2
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v2
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v2
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v56
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v2
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v42
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_alignbit_b32 v2, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT: v_alignbit_b32 v4, v6, v4, 16
-; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v53
-; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT: v_alignbit_b32 v6, v8, v6, 16
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT: v_alignbit_b32 v8, v10, v8, 16
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT: v_alignbit_b32 v10, v12, v10, 16
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT: v_alignbit_b32 v12, v14, v12, 16
-; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v23
-; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT: v_alignbit_b32 v14, v18, v14, 16
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT: v_alignbit_b32 v18, v20, v18, 16
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v23
-; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v47
-; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT: v_alignbit_b32 v24, v24, v23, 16
-; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v26
-; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_alignbit_b32 v22, v21, v20, 16
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v2
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v16
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; SI-NEXT: v_lshr_b64 v[48:49], v[21:22], 16
-; SI-NEXT: v_lshr_b64 v[49:50], v[7:8], 16
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
-; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v26
-; SI-NEXT: v_alignbit_b32 v26, v59, v25, 16
-; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v27
-; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; SI-NEXT: v_lshr_b64 v[51:52], v[25:26], 16
-; SI-NEXT: v_lshr_b64 v[52:53], v[1:2], 16
-; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v20
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v2, v28
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[2:3], v[27:28], 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; SI-NEXT: v_mov_b32_e32 v4, v38
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[4:5], v[37:38], 16
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; SI-NEXT: v_mov_b32_e32 v6, v49
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[6:7], v[48:49], 16
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v8
+; SI-NEXT: v_mov_b32_e32 v8, v51
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[8:9], v[50:51], 16
+; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v10
+; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v52
+; SI-NEXT: v_lshr_b64 v[51:52], v[61:62], 16
+; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: v_lshr_b64 v[10:11], v[27:28], 16
+; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v12
 ; SI-NEXT: s_waitcnt vmcnt(4)
 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
@@ -143,9 +565,9 @@
 ; SI-NEXT: v_lshr_b64 v[31:32], v[38:39], 16
 ; SI-NEXT: v_lshr_b64 v[37:38], v[5:6], 16
 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshr_b64 v[31:32], v[27:28], 16
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshr_b64 v[31:32], v[23:24], 16
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshr_b64 v[31:32], v[17:18], 16
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshr_b64 v[31:32], v[13:14], 16
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: v_lshr_b64 v[12:13], v[37:38], 16
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
+; SI-NEXT: v_lshr_b64 v[14:15], v[53:54], 16
+; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16
+; SI-NEXT: v_mov_b32_e32 v16, v33
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[16:17], v[32:33], 16
+; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v18
+; SI-NEXT: v_mov_b32_e32 v18, v41
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[18:19], v[40:41], 16
+; SI-NEXT: v_mov_b32_e32 v39, v59
+; SI-NEXT: v_mov_b32_e32 v40, v60
+; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v20
+; SI-NEXT: v_lshr_b64 v[20:21], v[39:40], 16
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
+; SI-NEXT: v_lshr_b64 v[22:23], v[56:57], 16
+; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v24
+; SI-NEXT: v_lshr_b64 v[24:25], v[46:47], 16
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v26
+; SI-NEXT: v_lshr_b64 v[26:27], v[35:36], 16
+; SI-NEXT: v_mov_b32_e32 v27, v30
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v34
+; SI-NEXT: v_lshr_b64 v[33:34], v[29:30], 16
+; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v31
+; SI-NEXT: v_mov_b32_e32 v30, v44
+; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshr_b64 v[31:32], v[11:12], 16
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: v_lshr_b64 v[30:31], v[43:44], 16
+; SI-NEXT: v_lshr_b64 v[43:44], v[32:33], 16
+; SI-NEXT: v_lshr_b64 v[34:35], v[50:51], 16
+; SI-NEXT: v_lshr_b64 v[48:49], v[5:6], 16
+; SI-NEXT: v_lshr_b64 v[37:38], v[3:4], 16
+; SI-NEXT: v_lshr_b64 v[44:45], v[1:2], 16
+; SI-NEXT: v_lshr_b64 v[27:28], v[25:26], 16
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshr_b64 v[31:32], v[9:10], 16
-; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: v_lshr_b64 v[27:28], v[23:24], 16
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshr_b64 v[31:32], v[3:4], 16
+; SI-NEXT: v_lshr_b64 v[27:28], v[21:22], 16
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[27:28], v[19:20], 16
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[27:28], v[17:18], 16
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[27:28], v[15:16], 16
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[27:28], v[13:14], 16
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[27:28], v[11:12], 16
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[27:28], v[9:10], 16
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: v_lshr_b64 v[31:32], v[29:30], 16
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshr_b64 v[27:28], v[7:8], 16
+; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
 ; SI-NEXT: .LBB105_5: ; %end
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v43
 ; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT: v_or_b32_e32 v1, v3, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v32
+; SI-NEXT: v_or_b32_e32 v3, v3, v5
+; SI-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v28
 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT: v_or_b32_e32 v1, v3, v1
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31
+; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v43
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v30
 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34
 ; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v29
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT: v_or_b32_e32 v1, v3, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v51
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62
 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT: v_or_b32_e32 v1, v3, v1
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36
 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v25
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT: v_or_b32_e32 v1, v3, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v25
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47
 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT: v_or_b32_e32 v1, v3, v1
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57
 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT: v_or_b32_e32 v1, v3, v1
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40
 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19
 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18
 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17
 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16
 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54
 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12
 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10
 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9
 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8
 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48
+; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7
 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded
Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -237263,19 +238425,19 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; VI-NEXT: v_mov_b32_e32 v31, v17 ; VI-NEXT: v_mov_b32_e32 v30, v16 -; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v32, v15 +; VI-NEXT: v_mov_b32_e32 v33, v13 +; VI-NEXT: v_mov_b32_e32 v34, v11 +; VI-NEXT: v_mov_b32_e32 v35, v9 +; VI-NEXT: v_mov_b32_e32 v36, v7 +; VI-NEXT: v_mov_b32_e32 v37, v5 +; VI-NEXT: v_mov_b32_e32 v38, v3 ; VI-NEXT: 
v_mov_b32_e32 v28, v14 -; VI-NEXT: v_mov_b32_e32 v27, v13 ; VI-NEXT: v_mov_b32_e32 v26, v12 -; VI-NEXT: v_mov_b32_e32 v25, v11 ; VI-NEXT: v_mov_b32_e32 v24, v10 -; VI-NEXT: v_mov_b32_e32 v23, v9 ; VI-NEXT: v_mov_b32_e32 v22, v8 -; VI-NEXT: v_mov_b32_e32 v21, v7 ; VI-NEXT: v_mov_b32_e32 v20, v6 -; VI-NEXT: v_mov_b32_e32 v19, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 -; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v48, v4 ; VI-NEXT: v_mov_b32_e32 v16, v2 ; VI-NEXT: v_readfirstlane_b32 s30, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec @@ -237284,583 +238446,595 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB105_4 ; VI-NEXT: .LBB105_2: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: s_lshl_b32 s4, s30, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: v_mov_b32_e32 v17, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: s_lshl_b32 s4, s28, 16 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v17 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_lshl_b32 s4, s30, 16 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_lshl_b32 s4, s31, 16 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_f32_e32 v2, s4, v17 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_alignbit_b32 v14, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_alignbit_b32 v15, v4, v3, 16 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s29, 
0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 ; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; 
VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[2:3] +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: s_lshl_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 
v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v17 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v1, s4, v17 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v17 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v17 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v18, v5, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v18, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v18, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v2 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v33, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v18, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v17 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v49, v5, v7, vcc +; VI-NEXT: 
v_add_f32_e32 v5, s4, v17 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; VI-NEXT: v_add_f32_e32 v7, s4, v17 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; VI-NEXT: v_add_f32_e32 v5, s4, v17 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; VI-NEXT: v_mov_b32_e32 v1, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc +; VI-NEXT: v_add_f32_e32 v7, s4, v17 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: v_mov_b32_e32 v3, v49 +; VI-NEXT: v_cndmask_b32_e32 v49, v9, v11, vcc +; VI-NEXT: v_add_f32_e32 v9, s4, v17 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: s_lshl_b32 s4, s25, 16 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; VI-NEXT: v_add_f32_e32 v11, s4, v17 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; VI-NEXT: v_add_f32_e32 v9, s4, v17 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; VI-NEXT: v_mov_b32_e32 v5, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: s_lshl_b32 s4, s27, 16 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; VI-NEXT: v_add_f32_e32 v11, s4, v17 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_mov_b32_e32 v7, v49 +; VI-NEXT: v_cndmask_b32_e32 v49, v13, v15, vcc +; VI-NEXT: v_add_f32_e32 v13, s4, v17 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; VI-NEXT: v_mov_b32_e32 v9, v18 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; 
VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v18, vcc +; VI-NEXT: v_add_f32_e32 v15, s4, v17 +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v33, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v18, v33, vcc -; VI-NEXT: v_add_f32_e32 v18, s4, v0 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v18, v1, 16 -; VI-NEXT: v_add_f32_e32 v18, s4, v0 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; VI-NEXT: v_add_f32_e32 v13, s4, v17 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_lshl_b32 s4, s31, 16 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc +; VI-NEXT: v_add_f32_e32 v15, s4, v17 +; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_mov_b32_e32 v11, v49 +; VI-NEXT: v_cndmask_b32_e32 v49, v17, v19, vcc +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v16 ; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; VI-NEXT: v_mov_b32_e32 v13, v18 +; VI-NEXT: v_bfe_u32 v18, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v16 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: 
v_cndmask_b32_e32 v16, v33, v34, vcc -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v17 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v17 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v34, v35, vcc -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 -; VI-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; VI-NEXT: v_bfe_u32 v35, v34, 16, 1 -; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v34 -; VI-NEXT: v_add_u32_e32 v35, vcc, 0x7fff, v35 -; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; VI-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc -; VI-NEXT: v_bfe_u32 v35, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v32 -; VI-NEXT: v_add_u32_e32 v35, vcc, 0x7fff, v35 -; VI-NEXT: v_or_b32_e32 v36, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_cndmask_b32_e32 v32, v35, v36, vcc -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v19 -; VI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; VI-NEXT: v_bfe_u32 v36, v35, 16, 1 -; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v35 -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36 -; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; VI-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v17 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v18 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v38 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v19, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v18 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 +; VI-NEXT: v_or_b32_e32 v21, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v19, v21, vcc +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v38 ; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc -; VI-NEXT: v_bfe_u32 v36, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v19 -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36 -; VI-NEXT: v_or_b32_e32 v37, 0x400000, v19 +; VI-NEXT: v_bfe_u32 v21, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v19 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 +; VI-NEXT: v_or_b32_e32 v23, 0x400000, v19 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 -; VI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; VI-NEXT: v_bfe_u32 v37, v36, 16, 1 -; VI-NEXT: v_add_u32_e32 v37, vcc, v37, v36 -; VI-NEXT: v_add_u32_e32 v37, vcc, 0x7fff, v37 +; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v48 +; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; VI-NEXT: v_cndmask_b32_e32 v38, v21, v23, vcc +; VI-NEXT: v_bfe_u32 
v21, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v19 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v48 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 +; VI-NEXT: v_or_b32_e32 v23, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_cndmask_b32_e32 v19, v21, v23, vcc +; VI-NEXT: v_bfe_u32 v21, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v18 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 +; VI-NEXT: v_or_b32_e32 v23, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v21, v23, vcc +; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v37 +; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; VI-NEXT: v_bfe_u32 v23, v21, 16, 1 +; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v21 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 +; VI-NEXT: v_or_b32_e32 v25, 0x400000, v21 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; VI-NEXT: v_cndmask_b32_e32 v21, v23, v25, vcc +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v37 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[49:50] +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; VI-NEXT: v_bfe_u32 v25, v23, 16, 1 +; VI-NEXT: v_mov_b32_e32 v15, v49 +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v23 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v21 +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v20 ; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; VI-NEXT: v_or_b32_e32 v38, 0x400000, v36 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v36, v36 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 ; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; VI-NEXT: v_cndmask_b32_e32 v36, v37, v38, vcc -; VI-NEXT: v_bfe_u32 v37, v20, 16, 1 -; VI-NEXT: v_add_u32_e32 v37, vcc, v37, v20 -; VI-NEXT: v_add_u32_e32 v37, vcc, 0x7fff, v37 -; VI-NEXT: v_or_b32_e32 v38, 0x400000, v20 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_bfe_u32 v23, v20, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v48, v25, v27, vcc +; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v20 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 +; VI-NEXT: v_or_b32_e32 v25, 0x400000, v20 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; VI-NEXT: v_cndmask_b32_e32 v20, v37, v38, vcc -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v21 -; VI-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; VI-NEXT: v_bfe_u32 v38, v37, 16, 1 -; VI-NEXT: v_add_u32_e32 v38, vcc, v38, v37 -; VI-NEXT: v_add_u32_e32 v38, vcc, 0x7fff, v38 -; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; VI-NEXT: v_or_b32_e32 v39, 0x400000, v37 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v37, v37 -; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; VI-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc -; VI-NEXT: v_bfe_u32 v38, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v38, vcc, v38, v21 -; VI-NEXT: v_add_u32_e32 v38, vcc, 0x7fff, v38 -; VI-NEXT: v_or_b32_e32 v39, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: v_cndmask_b32_e32 v21, v38, v39, vcc -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v22 -; VI-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; VI-NEXT: v_bfe_u32 v39, v38, 16, 1 -; VI-NEXT: v_add_u32_e32 v39, vcc, v39, v38 -; VI-NEXT: v_add_u32_e32 v39, vcc, 0x7fff, v39 +; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v21 +; VI-NEXT: v_bfe_u32 v21, v20, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v23, v23, v25, vcc +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v20 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 +; VI-NEXT: v_or_b32_e32 v25, 0x400000, v20 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; VI-NEXT: v_cndmask_b32_e32 v20, v21, v25, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v23 +; VI-NEXT: 
v_and_b32_e32 v23, 0xffff0000, v36 +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; VI-NEXT: v_bfe_u32 v25, v23, 16, 1 +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v23 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_cndmask_b32_e32 v23, v25, v27, vcc +; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v36 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; VI-NEXT: v_bfe_u32 v27, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v25 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v22 ; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; VI-NEXT: v_or_b32_e32 v48, 0x400000, v38 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v38, v38 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 ; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; VI-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc -; VI-NEXT: v_bfe_u32 v39, v22, 16, 1 -; VI-NEXT: v_add_u32_e32 v39, vcc, v39, v22 -; VI-NEXT: v_add_u32_e32 v39, vcc, 0x7fff, v39 -; VI-NEXT: v_or_b32_e32 v48, 0x400000, v22 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_bfe_u32 v25, v22, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v36, v27, v29, vcc +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v22 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; VI-NEXT: v_cndmask_b32_e32 v22, v39, v48, vcc -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 -; VI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; VI-NEXT: v_bfe_u32 v48, v39, 16, 1 -; VI-NEXT: v_add_u32_e32 v48, vcc, v48, v39 -; VI-NEXT: v_add_u32_e32 v48, vcc, 0x7fff, v48 -; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; VI-NEXT: v_or_b32_e32 v49, 0x400000, v39 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v39, v39 -; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc -; VI-NEXT: v_bfe_u32 v48, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v48, vcc, v48, v23 -; VI-NEXT: v_add_u32_e32 v48, vcc, 0x7fff, v48 -; VI-NEXT: v_or_b32_e32 v49, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_cndmask_b32_e32 v23, v48, v49, vcc -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v24 -; VI-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 -; VI-NEXT: v_bfe_u32 v49, v48, 16, 1 -; VI-NEXT: v_add_u32_e32 v49, vcc, v49, v48 -; VI-NEXT: v_add_u32_e32 v49, vcc, 0x7fff, v49 +; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v23 +; VI-NEXT: v_bfe_u32 v23, v22, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v25, v25, v27, vcc +; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v22 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; VI-NEXT: v_cndmask_b32_e32 v22, v23, v27, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v25 +; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v35 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; VI-NEXT: v_bfe_u32 v27, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v25 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_cndmask_b32_e32 v25, v27, v29, vcc +; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v35 +; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; VI-NEXT: v_bfe_u32 v29, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v27 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v24 ; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; VI-NEXT: v_or_b32_e32 v50, 0x400000, v48 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v48, v48 +; 
VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 ; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; VI-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc -; VI-NEXT: v_bfe_u32 v49, v24, 16, 1 -; VI-NEXT: v_add_u32_e32 v49, vcc, v49, v24 -; VI-NEXT: v_add_u32_e32 v49, vcc, 0x7fff, v49 -; VI-NEXT: v_or_b32_e32 v50, 0x400000, v24 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_bfe_u32 v27, v24, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v50, v29, v35, vcc +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v24 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v24 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_cndmask_b32_e32 v24, v49, v50, vcc -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v25 -; VI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 -; VI-NEXT: v_bfe_u32 v50, v49, 16, 1 -; VI-NEXT: v_add_u32_e32 v50, vcc, v50, v49 -; VI-NEXT: v_add_u32_e32 v50, vcc, 0x7fff, v50 -; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; VI-NEXT: v_or_b32_e32 v51, 0x400000, v49 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v49, v49 -; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_cndmask_b32_e32 v49, v50, v51, vcc -; VI-NEXT: v_bfe_u32 v50, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v50, vcc, v50, v25 -; VI-NEXT: v_add_u32_e32 v50, vcc, 0x7fff, v50 -; VI-NEXT: v_or_b32_e32 v51, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_cndmask_b32_e32 v25, v50, v51, vcc -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v26 -; VI-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 -; VI-NEXT: v_bfe_u32 v51, v50, 16, 1 -; VI-NEXT: v_add_u32_e32 v51, vcc, v51, v50 -; VI-NEXT: v_add_u32_e32 v51, vcc, 0x7fff, v51 +; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v25 +; VI-NEXT: v_bfe_u32 v25, v24, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v27, v27, v29, vcc +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v24 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v24 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; VI-NEXT: v_cndmask_b32_e32 v24, v25, v29, vcc +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v27 +; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34 +; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; VI-NEXT: v_bfe_u32 v29, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v27 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_cndmask_b32_e32 v27, v29, v35, vcc +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v34 +; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; VI-NEXT: v_bfe_u32 v34, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v29 +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 +; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v26 ; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; VI-NEXT: v_or_b32_e32 v52, 0x400000, v50 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v50, v50 ; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; VI-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc -; VI-NEXT: v_bfe_u32 v51, v26, 16, 1 -; VI-NEXT: v_add_u32_e32 v51, vcc, v51, v26 -; VI-NEXT: v_add_u32_e32 v51, vcc, 0x7fff, v51 +; VI-NEXT: v_bfe_u32 v29, v26, 16, 1 +; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v26 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 ; VI-NEXT: v_or_b32_e32 v52, 0x400000, v26 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: v_cndmask_b32_e32 v26, v51, v52, vcc -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v27 -; VI-NEXT: v_add_f32_e32 v51, 
0x40c00000, v51 -; VI-NEXT: v_bfe_u32 v52, v51, 16, 1 -; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v51 +; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v27 +; VI-NEXT: v_bfe_u32 v27, v26, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v29, v29, v52, vcc +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v26 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v52, 0x400000, v26 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: v_cndmask_b32_e32 v26, v27, v52, vcc +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v29 +; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v33 +; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; VI-NEXT: v_bfe_u32 v52, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v29 ; VI-NEXT: v_add_u32_e32 v52, vcc, 0x7fff, v52 -; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; VI-NEXT: v_or_b32_e32 v53, 0x400000, v51 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v51, v51 -; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_cndmask_b32_e32 v51, v52, v53, vcc -; VI-NEXT: v_bfe_u32 v52, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v27 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_or_b32_e32 v53, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_cndmask_b32_e32 v29, v52, v53, vcc +; VI-NEXT: v_bfe_u32 v52, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v33 ; VI-NEXT: v_add_u32_e32 v52, vcc, 0x7fff, v52 -; VI-NEXT: v_or_b32_e32 v53, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: v_cndmask_b32_e32 v27, v52, v53, vcc -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v28 -; VI-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 -; VI-NEXT: v_bfe_u32 v53, v52, 16, 1 -; VI-NEXT: v_add_u32_e32 v53, vcc, v53, v52 -; VI-NEXT: v_add_u32_e32 v53, vcc, 0x7fff, v53 +; VI-NEXT: v_or_b32_e32 v53, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_cndmask_b32_e32 v52, v52, v53, vcc +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v28 ; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; VI-NEXT: v_or_b32_e32 v54, 0x400000, v52 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v52, v52 ; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; VI-NEXT: v_cndmask_b32_e32 v52, v53, v54, vcc -; VI-NEXT: v_bfe_u32 v53, v28, 16, 1 -; VI-NEXT: v_add_u32_e32 v53, vcc, v53, v28 -; VI-NEXT: v_add_u32_e32 v53, vcc, 0x7fff, v53 +; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v54, 0x400000, v28 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; VI-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v29 -; VI-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 -; VI-NEXT: v_bfe_u32 v54, v53, 16, 1 -; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v53 +; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v29 +; VI-NEXT: v_bfe_u32 v29, v28, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v33, v33, v54, vcc +; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v28 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 +; VI-NEXT: v_or_b32_e32 v54, 0x400000, v28 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_cndmask_b32_e32 v28, v29, v54, vcc +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v33 +; VI-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v54, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v33 ; VI-NEXT: v_add_u32_e32 v54, vcc, 0x7fff, v54 -; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; VI-NEXT: v_or_b32_e32 v55, 0x400000, v53 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 -; VI-NEXT: v_add_f32_e32 
v29, 0x40c00000, v29 -; VI-NEXT: v_cndmask_b32_e32 v53, v54, v55, vcc -; VI-NEXT: v_bfe_u32 v54, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v29 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_or_b32_e32 v55, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_cndmask_b32_e32 v33, v54, v55, vcc +; VI-NEXT: v_bfe_u32 v54, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v32 ; VI-NEXT: v_add_u32_e32 v54, vcc, 0x7fff, v54 -; VI-NEXT: v_or_b32_e32 v55, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: v_cndmask_b32_e32 v29, v54, v55, vcc +; VI-NEXT: v_or_b32_e32 v55, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_cndmask_b32_e32 v32, v54, v55, vcc ; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v30 -; VI-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 -; VI-NEXT: v_bfe_u32 v55, v54, 16, 1 -; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v54 -; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55 ; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; VI-NEXT: v_or_b32_e32 v40, 0x400000, v54 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 ; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc ; VI-NEXT: v_bfe_u32 v55, v30, 16, 1 ; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v30 ; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55 ; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 ; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v31 -; VI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 -; VI-NEXT: v_bfe_u32 v40, v55, 16, 1 -; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v55 -; VI-NEXT: v_add_u32_e32 v40, vcc, 0x7fff, v40 -; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; VI-NEXT: v_or_b32_e32 v41, 0x400000, v55 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v55, v55 -; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; VI-NEXT: v_cndmask_b32_e32 v55, v40, v41, vcc -; VI-NEXT: v_bfe_u32 v40, v31, 16, 1 -; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v31 +; VI-NEXT: v_bfe_u32 v55, v54, 16, 1 +; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v54 +; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55 +; VI-NEXT: v_or_b32_e32 v40, 0x400000, v54 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 +; VI-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v30 +; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v31 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_bfe_u32 v40, v30, 16, 1 +; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v30 ; VI-NEXT: v_add_u32_e32 v40, vcc, 0x7fff, v40 -; VI-NEXT: v_or_b32_e32 v41, 0x400000, v31 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; VI-NEXT: v_cndmask_b32_e32 v31, v40, v41, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_or_b32_e32 v41, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v31 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_bfe_u32 v31, v30, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[48:49] +; VI-NEXT: v_cndmask_b32_e32 v40, v40, 
v41, vcc +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v30 +; VI-NEXT: v_lshrrev_b64 v[36:37], 16, v[36:37] +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x7fff, v31 +; VI-NEXT: v_mov_b32_e32 v37, v48 +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[50:51] +; VI-NEXT: v_or_b32_e32 v41, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[34:35] +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_cndmask_b32_e32 v30, v31, v41, vcc +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v40 +; VI-NEXT: v_mov_b32_e32 v35, v48 +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[52:53] ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v31, v31, v55, 16 -; VI-NEXT: v_alignbit_b32 v30, v30, v54, 16 -; VI-NEXT: v_alignbit_b32 v29, v29, v53, 16 -; VI-NEXT: v_alignbit_b32 v28, v28, v52, 16 -; VI-NEXT: v_alignbit_b32 v27, v27, v51, 16 -; VI-NEXT: v_alignbit_b32 v26, v26, v50, 16 -; VI-NEXT: v_alignbit_b32 v25, v25, v49, 16 -; VI-NEXT: v_alignbit_b32 v24, v24, v48, 16 -; VI-NEXT: v_alignbit_b32 v23, v23, v39, 16 -; VI-NEXT: v_alignbit_b32 v22, v22, v38, 16 -; VI-NEXT: v_alignbit_b32 v21, v21, v37, 16 -; VI-NEXT: v_alignbit_b32 v20, v20, v36, 16 -; VI-NEXT: v_alignbit_b32 v19, v19, v35, 16 -; VI-NEXT: v_alignbit_b32 v32, v32, v34, 16 -; VI-NEXT: v_alignbit_b32 v17, v17, v33, 16 -; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[50:51], 16, v[30:31] +; VI-NEXT: v_lshrrev_b64 v[38:39], 16, v[38:39] +; VI-NEXT: v_mov_b32_e32 v33, v48 +; VI-NEXT: v_lshrrev_b64 v[30:31], 16, v[54:55] +; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[28:29] +; VI-NEXT: v_lshrrev_b64 v[26:27], 16, v[26:27] +; VI-NEXT: v_lshrrev_b64 v[24:25], 16, v[24:25] +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23] +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21] +; VI-NEXT: v_lshrrev_b64 v[48:49], 16, v[18:19] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[16:17] +; VI-NEXT: v_mov_b32_e32 v31, v50 ; VI-NEXT: s_branch .LBB105_5 ; VI-NEXT: .LBB105_3: ; VI-NEXT: s_branch .LBB105_2 @@ -237885,7 +239059,14 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; VI-NEXT: v_readlane_b32 s30, v42, 0 -; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: v_mov_b32_e32 v17, v38 +; VI-NEXT: v_mov_b32_e32 v18, v48 +; VI-NEXT: v_mov_b32_e32 v19, v37 +; VI-NEXT: v_mov_b32_e32 v21, v36 +; VI-NEXT: v_mov_b32_e32 v23, v35 +; VI-NEXT: v_mov_b32_e32 v25, v34 +; VI-NEXT: v_mov_b32_e32 v27, v33 +; VI-NEXT: v_mov_b32_e32 v29, v32 ; VI-NEXT: v_readlane_b32 s31, v42, 1 ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -240931,49 +242112,54 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 +; SI-NEXT: s_mov_b32 s74, s23 +; SI-NEXT: s_mov_b32 s72, s21 +; SI-NEXT: s_mov_b32 s61, s18 ; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane ; SI-NEXT: s_mov_b32 s60, s16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v41, s17, 0 -; SI-NEXT: s_mov_b32 s61, s19 ; SI-NEXT: v_writelane_b32 
v41, s60, 1 -; SI-NEXT: s_mov_b32 s63, s18 -; SI-NEXT: v_writelane_b32 v41, s61, 2 -; SI-NEXT: s_mov_b32 s72, s21 -; SI-NEXT: v_writelane_b32 v41, s63, 3 +; SI-NEXT: v_writelane_b32 v41, s19, 2 +; SI-NEXT: v_writelane_b32 v41, s61, 3 ; SI-NEXT: v_writelane_b32 v41, s72, 4 -; SI-NEXT: s_mov_b32 s74, s23 ; SI-NEXT: v_writelane_b32 v41, s20, 5 ; SI-NEXT: v_writelane_b32 v41, s74, 6 -; SI-NEXT: s_mov_b32 s75, s25 +; SI-NEXT: s_mov_b32 s76, s25 ; SI-NEXT: v_writelane_b32 v41, s22, 7 -; SI-NEXT: v_writelane_b32 v41, s75, 8 -; SI-NEXT: s_mov_b32 s76, s27 +; SI-NEXT: v_writelane_b32 v41, s76, 8 +; SI-NEXT: s_mov_b32 s78, s27 ; SI-NEXT: v_writelane_b32 v41, s24, 9 -; SI-NEXT: v_writelane_b32 v41, s76, 10 -; SI-NEXT: s_mov_b32 s93, s29 +; SI-NEXT: v_writelane_b32 v41, s78, 10 +; SI-NEXT: s_mov_b32 s88, s29 ; SI-NEXT: v_writelane_b32 v41, s26, 11 -; SI-NEXT: v_writelane_b32 v41, s93, 12 -; SI-NEXT: v_readfirstlane_b32 s16, v2 +; SI-NEXT: v_writelane_b32 v41, s88, 12 +; SI-NEXT: v_readfirstlane_b32 s77, v2 ; SI-NEXT: v_writelane_b32 v41, s28, 13 -; SI-NEXT: v_readfirstlane_b32 s73, v4 -; SI-NEXT: v_writelane_b32 v41, s16, 14 -; SI-NEXT: v_readfirstlane_b32 s89, v3 -; SI-NEXT: v_writelane_b32 v41, s73, 15 -; SI-NEXT: v_readfirstlane_b32 s90, v6 -; SI-NEXT: v_writelane_b32 v41, s89, 16 -; SI-NEXT: v_readfirstlane_b32 s91, v5 -; SI-NEXT: v_writelane_b32 v41, s90, 17 -; SI-NEXT: v_readfirstlane_b32 s34, v8 -; SI-NEXT: v_writelane_b32 v41, s91, 18 -; SI-NEXT: v_readfirstlane_b32 s35, v7 -; SI-NEXT: v_writelane_b32 v41, s34, 19 -; SI-NEXT: v_readfirstlane_b32 s36, v10 -; SI-NEXT: v_writelane_b32 v41, s35, 20 -; SI-NEXT: v_readfirstlane_b32 s37, v9 -; SI-NEXT: v_writelane_b32 v41, s36, 21 -; SI-NEXT: v_readfirstlane_b32 s38, v12 -; SI-NEXT: v_writelane_b32 v41, s37, 22 +; SI-NEXT: v_readfirstlane_b32 s79, v4 +; SI-NEXT: v_writelane_b32 v41, s77, 14 +; SI-NEXT: v_readfirstlane_b32 s90, v3 +; SI-NEXT: v_writelane_b32 v41, s79, 15 +; SI-NEXT: v_readfirstlane_b32 s91, v6 +; SI-NEXT: v_writelane_b32 v41, s90, 16 +; SI-NEXT: v_readfirstlane_b32 s92, v5 +; SI-NEXT: v_writelane_b32 v41, s91, 17 +; SI-NEXT: v_readfirstlane_b32 s93, v8 +; SI-NEXT: v_writelane_b32 v41, s92, 18 +; SI-NEXT: v_readfirstlane_b32 s94, v7 +; SI-NEXT: v_writelane_b32 v41, s93, 19 +; SI-NEXT: v_readfirstlane_b32 s95, v10 +; SI-NEXT: v_writelane_b32 v41, s94, 20 +; SI-NEXT: v_readfirstlane_b32 s30, v9 +; SI-NEXT: v_writelane_b32 v41, s95, 21 +; SI-NEXT: v_readfirstlane_b32 s31, v12 +; SI-NEXT: v_writelane_b32 v41, s30, 22 +; SI-NEXT: v_writelane_b32 v41, s31, 23 +; SI-NEXT: v_readfirstlane_b32 s34, v11 +; SI-NEXT: v_readfirstlane_b32 s35, v14 +; SI-NEXT: v_readfirstlane_b32 s36, v13 +; SI-NEXT: v_readfirstlane_b32 s37, v16 +; SI-NEXT: v_readfirstlane_b32 s38, v15 ; SI-NEXT: v_readfirstlane_b32 s14, v30 ; SI-NEXT: v_readfirstlane_b32 s15, v29 ; SI-NEXT: v_readfirstlane_b32 s12, v28 @@ -240982,26 +242168,18 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: v_readfirstlane_b32 s11, v25 ; SI-NEXT: v_readfirstlane_b32 s8, v24 ; SI-NEXT: v_readfirstlane_b32 s9, v23 -; SI-NEXT: v_readfirstlane_b32 s88, v22 -; SI-NEXT: v_readfirstlane_b32 s29, v21 -; SI-NEXT: v_readfirstlane_b32 s79, v20 -; SI-NEXT: v_readfirstlane_b32 s27, v19 -; SI-NEXT: v_readfirstlane_b32 s78, v18 -; SI-NEXT: v_readfirstlane_b32 s25, v17 -; SI-NEXT: v_readfirstlane_b32 s77, v16 -; SI-NEXT: v_readfirstlane_b32 s23, v15 -; SI-NEXT: v_readfirstlane_b32 s39, v14 -; SI-NEXT: v_readfirstlane_b32 s21, v13 -; SI-NEXT: v_readfirstlane_b32 s19, 
v11 -; SI-NEXT: v_readfirstlane_b32 s18, v1 -; SI-NEXT: v_writelane_b32 v41, s38, 23 -; SI-NEXT: v_writelane_b32 v41, s39, 24 +; SI-NEXT: v_readfirstlane_b32 s89, v22 +; SI-NEXT: v_readfirstlane_b32 s7, v21 +; SI-NEXT: v_readfirstlane_b32 s25, v20 +; SI-NEXT: v_readfirstlane_b32 s29, v19 +; SI-NEXT: v_readfirstlane_b32 s39, v18 +; SI-NEXT: v_readfirstlane_b32 s27, v17 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s62, v31 +; SI-NEXT: v_readfirstlane_b32 s21, v31 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s80, v32 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s69, v33 +; SI-NEXT: v_readfirstlane_b32 s75, v33 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 @@ -241013,13 +242191,13 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s84, v34 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s68, v35 +; SI-NEXT: v_readfirstlane_b32 s23, v35 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s83, v36 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s87, v38 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 -; SI-NEXT: v_readfirstlane_b32 s6, v37 +; SI-NEXT: v_readfirstlane_b32 s18, v37 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 @@ -241043,261 +242221,289 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_readfirstlane_b32 s42, v34 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v38 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v1 +; SI-NEXT: v_writelane_b32 v41, s5, 24 +; SI-NEXT: v_writelane_b32 v41, s34, 25 +; SI-NEXT: v_writelane_b32 v41, s35, 26 +; SI-NEXT: v_writelane_b32 v41, s36, 27 +; SI-NEXT: v_writelane_b32 v41, s37, 28 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_readfirstlane_b32 s43, v35 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_readfirstlane_b32 s40, v36 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s41, v37 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_writelane_b32 v41, s38, 29 +; SI-NEXT: v_writelane_b32 v41, s39, 30 ; SI-NEXT: s_cbranch_scc0 .LBB107_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshl_b32 s4, s60, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 25 -; SI-NEXT: s_lshl_b32 s4, s63, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 26 -; SI-NEXT: s_lshl_b32 s4, s20, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 27 -; SI-NEXT: s_lshl_b32 s4, s22, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 28 -; SI-NEXT: s_lshl_b32 s4, s24, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 29 -; SI-NEXT: s_lshl_b32 s4, s26, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 30 -; SI-NEXT: s_lshl_b32 s4, s28, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 31 -; SI-NEXT: s_lshl_b32 s4, s18, 16 ; SI-NEXT: v_writelane_b32 v41, s4, 32 -; SI-NEXT: s_lshl_b32 s4, s89, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 33 -; SI-NEXT: s_lshl_b32 s4, s91, 16 +; SI-NEXT: s_lshl_b32 s4, s17, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 31 +; SI-NEXT: s_lshl_b32 s4, s61, 16 ; SI-NEXT: v_writelane_b32 v41, s4, 34 -; SI-NEXT: s_lshl_b32 s4, s35, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 35 -; SI-NEXT: s_lshl_b32 s4, s37, 16 -; SI-NEXT: s_lshl_b32 s7, s17, 16 -; SI-NEXT: s_lshl_b32 s96, s61, 16 -; 
SI-NEXT: s_lshl_b32 s99, s72, 16 -; SI-NEXT: s_lshl_b32 s97, s74, 16 -; SI-NEXT: s_lshl_b32 s92, s75, 16 -; SI-NEXT: s_lshl_b32 s94, s76, 16 -; SI-NEXT: s_lshl_b32 s95, s93, 16 -; SI-NEXT: s_lshl_b32 s93, s16, 16 -; SI-NEXT: s_lshl_b32 s30, s73, 16 -; SI-NEXT: s_lshl_b32 s31, s90, 16 -; SI-NEXT: s_lshl_b32 s34, s34, 16 +; SI-NEXT: s_lshl_b32 s4, s19, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 33 +; SI-NEXT: s_lshl_b32 s4, s20, 16 ; SI-NEXT: v_writelane_b32 v41, s4, 36 -; SI-NEXT: s_lshl_b32 s35, s36, 16 -; SI-NEXT: s_lshl_b32 s86, s19, 16 -; SI-NEXT: s_lshl_b32 s36, s38, 16 -; SI-NEXT: s_lshl_b32 s22, s21, 16 -; SI-NEXT: s_lshl_b32 s37, s39, 16 -; SI-NEXT: s_lshl_b32 s24, s23, 16 -; SI-NEXT: s_lshl_b32 s38, s77, 16 -; SI-NEXT: s_lshl_b32 s28, s25, 16 -; SI-NEXT: s_lshl_b32 s39, s78, 16 -; SI-NEXT: s_lshl_b32 s61, s27, 16 -; SI-NEXT: s_lshl_b32 s48, s79, 16 -; SI-NEXT: s_lshl_b32 s89, s29, 16 -; SI-NEXT: s_lshl_b32 s49, s88, 16 -; SI-NEXT: s_lshl_b32 s60, s9, 16 -; SI-NEXT: s_lshl_b32 s50, s8, 16 -; SI-NEXT: s_lshl_b32 s90, s11, 16 -; SI-NEXT: s_lshl_b32 s91, s10, 16 -; SI-NEXT: s_lshl_b32 s70, s13, 16 -; SI-NEXT: s_lshl_b32 s51, s12, 16 -; SI-NEXT: s_lshl_b32 s71, s15, 16 -; SI-NEXT: s_lshl_b32 s52, s14, 16 -; SI-NEXT: s_lshl_b32 s20, s41, 16 -; SI-NEXT: s_lshl_b32 s53, s40, 16 -; SI-NEXT: s_lshl_b32 s81, s43, 16 -; SI-NEXT: s_lshl_b32 s54, s42, 16 -; SI-NEXT: s_lshl_b32 s63, s45, 16 -; SI-NEXT: s_lshl_b32 s55, s44, 16 -; SI-NEXT: s_lshl_b32 s72, s47, 16 -; SI-NEXT: s_lshl_b32 s64, s46, 16 -; SI-NEXT: s_lshl_b32 s82, s57, 16 -; SI-NEXT: s_lshl_b32 s65, s56, 16 -; SI-NEXT: s_lshl_b32 s74, s59, 16 -; SI-NEXT: s_lshl_b32 s66, s58, 16 -; SI-NEXT: s_lshl_b32 s75, s87, 16 -; SI-NEXT: s_mov_b32 s73, s6 -; SI-NEXT: s_lshl_b32 s67, s6, 16 -; SI-NEXT: s_lshl_b32 s76, s83, 16 -; SI-NEXT: s_mov_b32 s16, s68 -; SI-NEXT: s_lshl_b32 s68, s68, 16 -; SI-NEXT: s_lshl_b32 s85, s84, 16 -; SI-NEXT: s_mov_b32 s98, s69 -; SI-NEXT: s_lshl_b32 s69, s69, 16 -; SI-NEXT: s_lshl_b32 s17, s80, 16 -; SI-NEXT: s_mov_b32 s6, s62 -; SI-NEXT: s_lshl_b32 s26, s62, 16 +; SI-NEXT: s_lshl_b32 s4, s72, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 35 +; SI-NEXT: s_lshl_b32 s4, s74, 16 +; SI-NEXT: s_lshl_b32 s16, s22, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 37 +; SI-NEXT: s_lshl_b32 s6, s24, 16 +; SI-NEXT: s_lshl_b32 s73, s76, 16 +; SI-NEXT: s_lshl_b32 s98, s26, 16 +; SI-NEXT: s_lshl_b32 s63, s78, 16 +; SI-NEXT: s_lshl_b32 s96, s28, 16 +; SI-NEXT: s_lshl_b32 s62, s88, 16 +; SI-NEXT: s_lshl_b32 s97, s5, 16 +; SI-NEXT: s_lshl_b32 s99, s77, 16 +; SI-NEXT: s_lshl_b32 s85, s90, 16 +; SI-NEXT: s_lshl_b32 s86, s79, 16 +; SI-NEXT: s_lshl_b32 s81, s92, 16 +; SI-NEXT: s_lshl_b32 s82, s91, 16 +; SI-NEXT: s_lshl_b32 s70, s94, 16 +; SI-NEXT: s_lshl_b32 s71, s93, 16 +; SI-NEXT: s_lshl_b32 s68, s30, 16 +; SI-NEXT: s_lshl_b32 s69, s95, 16 +; SI-NEXT: s_lshl_b32 s66, s34, 16 +; SI-NEXT: s_lshl_b32 s67, s31, 16 +; SI-NEXT: s_lshl_b32 s64, s36, 16 +; SI-NEXT: s_lshl_b32 s65, s35, 16 +; SI-NEXT: s_lshl_b32 s54, s38, 16 +; SI-NEXT: s_lshl_b32 s55, s37, 16 +; SI-NEXT: s_lshl_b32 s52, s27, 16 +; SI-NEXT: s_lshl_b32 s53, s39, 16 +; SI-NEXT: s_lshl_b32 s50, s29, 16 +; SI-NEXT: s_lshl_b32 s51, s25, 16 +; SI-NEXT: s_lshl_b32 s48, s7, 16 +; SI-NEXT: s_lshl_b32 s49, s89, 16 +; SI-NEXT: s_lshl_b32 s38, s9, 16 +; SI-NEXT: s_lshl_b32 s39, s8, 16 +; SI-NEXT: s_lshl_b32 s37, s11, 16 +; SI-NEXT: s_lshl_b32 s35, s10, 16 +; SI-NEXT: s_lshl_b32 s31, s13, 16 +; SI-NEXT: s_lshl_b32 s36, s12, 16 +; SI-NEXT: s_lshl_b32 s95, s15, 16 +; SI-NEXT: s_lshl_b32 s34, s14, 16 +; SI-NEXT: 
s_lshl_b32 s93, s41, 16 +; SI-NEXT: s_lshl_b32 s30, s40, 16 +; SI-NEXT: s_lshl_b32 s91, s43, 16 +; SI-NEXT: s_lshl_b32 s94, s42, 16 +; SI-NEXT: s_lshl_b32 s92, s45, 16 +; SI-NEXT: s_lshl_b32 s90, s44, 16 +; SI-NEXT: s_lshl_b32 s88, s47, 16 +; SI-NEXT: s_lshl_b32 s28, s46, 16 +; SI-NEXT: s_lshl_b32 s78, s57, 16 +; SI-NEXT: s_lshl_b32 s26, s56, 16 +; SI-NEXT: s_lshl_b32 s76, s59, 16 +; SI-NEXT: s_lshl_b32 s24, s58, 16 +; SI-NEXT: s_lshl_b32 s74, s87, 16 +; SI-NEXT: s_mov_b32 s77, s18 +; SI-NEXT: s_lshl_b32 s22, s18, 16 +; SI-NEXT: s_lshl_b32 s72, s83, 16 +; SI-NEXT: s_mov_b32 s79, s23 +; SI-NEXT: s_lshl_b32 s20, s23, 16 +; SI-NEXT: s_lshl_b32 s61, s84, 16 +; SI-NEXT: s_mov_b32 s18, s75 +; SI-NEXT: s_lshl_b32 s19, s75, 16 +; SI-NEXT: s_lshl_b32 s60, s80, 16 +; SI-NEXT: s_lshl_b32 s17, s21, 16 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB107_3 ; SI-NEXT: .LBB107_2: -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: s_mov_b32 s16, s68 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: s_mov_b32 s73, s6 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: s_mov_b32 s6, s62 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: s_mov_b32 s98, s69 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: s_mov_b32 s79, s23 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: s_mov_b32 s77, s18 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: s_mov_b32 s18, s75 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr63 ; SI-NEXT: ; implicit-def: $sgpr96 -; SI-NEXT: ; implicit-def: $sgpr99 +; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: ; implicit-def: $sgpr97 -; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; implicit-def: $sgpr94 -; SI-NEXT: ; implicit-def: $sgpr95 -; SI-NEXT: ; implicit-def: $sgpr93 -; SI-NEXT: ; implicit-def: $sgpr30 -; SI-NEXT: ; implicit-def: $sgpr31 -; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr99 +; SI-NEXT: ; implicit-def: $sgpr85 ; SI-NEXT: ; implicit-def: $sgpr86 -; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr22 -; SI-NEXT: ; implicit-def: $sgpr37 -; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr39 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr89 -; SI-NEXT: ; implicit-def: $sgpr49 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr91 -; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr51 -; SI-NEXT: ; implicit-def: $sgpr71 -; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $sgpr53 ; SI-NEXT: ; implicit-def: $sgpr81 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $sgpr55 -; SI-NEXT: ; implicit-def: $sgpr72 
-; SI-NEXT: ; implicit-def: $sgpr64 ; SI-NEXT: ; implicit-def: $sgpr82 -; SI-NEXT: ; implicit-def: $sgpr65 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr75 -; SI-NEXT: ; implicit-def: $sgpr67 -; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr71 ; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr85 ; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: .LBB107_3: ; %Flow ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_mov_b32 s4, s60 ; SI-NEXT: s_mov_b32 s5, s17 -; SI-NEXT: s_mov_b32 s17, s86 -; SI-NEXT: s_mov_b32 s86, s7 +; SI-NEXT: s_mov_b32 s17, s61 +; SI-NEXT: s_mov_b32 s60, s72 +; SI-NEXT: s_mov_b32 s61, s74 +; SI-NEXT: s_mov_b32 s72, s76 +; SI-NEXT: s_mov_b32 s74, s78 +; SI-NEXT: s_mov_b32 s76, s88 +; SI-NEXT: s_mov_b32 s78, s92 +; SI-NEXT: s_mov_b32 s88, s91 +; SI-NEXT: s_mov_b32 s91, s93 +; SI-NEXT: s_mov_b32 s92, s94 +; SI-NEXT: s_mov_b32 s93, s95 +; SI-NEXT: s_mov_b32 s94, s30 +; SI-NEXT: s_mov_b32 s95, s31 +; SI-NEXT: s_mov_b32 s30, s34 +; SI-NEXT: s_mov_b32 s31, s37 +; SI-NEXT: s_mov_b32 s34, s36 +; SI-NEXT: s_mov_b32 s36, s38 +; SI-NEXT: s_mov_b32 s37, s39 +; SI-NEXT: s_mov_b32 s38, s48 +; SI-NEXT: s_mov_b32 s39, s49 +; SI-NEXT: s_mov_b32 s48, s50 +; SI-NEXT: s_mov_b32 s49, s51 +; SI-NEXT: s_mov_b32 s50, s52 +; SI-NEXT: s_mov_b32 s51, s53 +; SI-NEXT: s_mov_b32 s52, s54 +; SI-NEXT: s_mov_b32 s53, s55 +; SI-NEXT: s_mov_b32 s54, s6 +; SI-NEXT: s_mov_b32 
s55, s16 ; SI-NEXT: s_cbranch_vccnz .LBB107_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_lshl_b32 s5, s6, 16 -; SI-NEXT: v_readlane_b32 s6, v41, 24 -; SI-NEXT: s_lshl_b32 s20, s6, 16 -; SI-NEXT: v_readlane_b32 s6, v41, 23 -; SI-NEXT: s_lshl_b32 s17, s6, 16 -; SI-NEXT: v_readlane_b32 s6, v41, 22 -; SI-NEXT: s_lshl_b32 s61, s16, 16 -; SI-NEXT: s_add_i32 s16, s6, 3 -; SI-NEXT: v_readlane_b32 s6, v41, 21 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s7, s6, 16 -; SI-NEXT: v_readlane_b32 s6, v41, 20 -; SI-NEXT: s_or_b32 s7, s7, s16 -; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: v_readlane_b32 s16, v41, 19 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_and_b32 s19, s19, 0xffff -; SI-NEXT: s_or_b32 s6, s16, s6 -; SI-NEXT: v_readlane_b32 s16, v41, 18 -; SI-NEXT: s_lshl_b32 s60, s98, 16 -; SI-NEXT: s_or_b32 s17, s17, s19 -; SI-NEXT: s_add_i32 s98, s16, 3 -; SI-NEXT: v_readlane_b32 s19, v41, 17 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_and_b32 s16, s98, 0xffff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_and_b32 s21, s21, 0xffff -; SI-NEXT: s_or_b32 s16, s19, s16 -; SI-NEXT: v_readlane_b32 s19, v41, 16 ; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: v_readlane_b32 s6, v41, 30 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: s_and_b32 s11, s11, 0xffff ; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_lshl_b32 s13, s6, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 29 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_lshl_b32 s11, s25, 16 +; SI-NEXT: s_add_i32 s25, s6, 3 +; SI-NEXT: v_readlane_b32 s6, v41, 28 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_lshl_b32 s15, s6, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 27 +; SI-NEXT: s_add_i32 s23, s6, 3 +; SI-NEXT: v_readlane_b32 s6, v41, 26 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_lshl_b32 s20, s6, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 25 +; SI-NEXT: s_lshl_b32 s5, s21, 16 ; SI-NEXT: s_and_b32 s9, s9, 0xffff ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_or_b32 s20, s20, s21 -; SI-NEXT: s_add_i32 s96, s19, 3 -; SI-NEXT: v_readlane_b32 s21, v41, 15 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_and_b32 s13, s13, 0xffff -; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s21, s6, 3 +; SI-NEXT: v_readlane_b32 s6, v41, 23 ; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s9, s89, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_lshl_b32 s19, s6, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 22 +; SI-NEXT: s_or_b32 s7, s9, s7 ; SI-NEXT: s_and_b32 s9, s29, 0xffff -; SI-NEXT: s_lshl_b32 s11, s88, 16 ; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_and_b32 s19, s96, 0xffff -; SI-NEXT: s_lshl_b32 s21, s21, 16 -; SI-NEXT: s_and_b32 s15, s15, 0xffff -; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_add_i32 s16, s6, 3 +; SI-NEXT: v_readlane_b32 s6, v41, 21 ; SI-NEXT: s_or_b32 s9, s11, s9 ; SI-NEXT: s_and_b32 s11, s27, 0xffff -; SI-NEXT: s_lshl_b32 s13, s79, 16 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_or_b32 s19, s21, s19 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_readlane_b32 s21, v41, 14 -; SI-NEXT: s_or_b32 s14, 
s14, s15 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s6, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 20 ; SI-NEXT: s_or_b32 s11, s13, s11 ; SI-NEXT: s_and_b32 s13, s25, 0xffff -; SI-NEXT: s_lshl_b32 s15, s78, 16 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: v_readlane_b32 s17, v41, 19 ; SI-NEXT: s_or_b32 s13, s15, s13 ; SI-NEXT: s_and_b32 s15, s23, 0xffff -; SI-NEXT: s_lshl_b32 s22, s77, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s60, s18, 16 +; SI-NEXT: s_or_b32 s15, s20, s15 +; SI-NEXT: s_and_b32 s20, s21, 0xffff +; SI-NEXT: s_or_b32 s6, s17, s6 +; SI-NEXT: v_readlane_b32 s17, v41, 18 +; SI-NEXT: v_readlane_b32 s18, v41, 17 +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_add_i32 s98, s17, 3 +; SI-NEXT: s_lshl_b32 s20, s18, 16 +; SI-NEXT: v_readlane_b32 s18, v41, 16 +; SI-NEXT: s_and_b32 s17, s98, 0xffff +; SI-NEXT: s_add_i32 s96, s18, 3 +; SI-NEXT: v_readlane_b32 s18, v41, 15 +; SI-NEXT: s_or_b32 s17, s20, s17 +; SI-NEXT: s_and_b32 s20, s96, 0xffff +; SI-NEXT: s_lshl_b32 s21, s18, 16 +; SI-NEXT: v_readlane_b32 s18, v41, 24 +; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_readlane_b32 s21, v41, 14 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s21, s21, 16 ; SI-NEXT: s_or_b32 s18, s21, s18 ; SI-NEXT: v_readlane_b32 s21, v41, 13 -; SI-NEXT: s_or_b32 s15, s22, s15 ; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: v_readlane_b32 s22, v41, 12 ; SI-NEXT: s_and_b32 s21, s21, 0xffff @@ -241339,42 +242545,20 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_and_b32 s27, s27, 0xffff ; SI-NEXT: s_lshl_b32 s28, s28, 16 ; SI-NEXT: s_or_b32 s27, s28, s27 -; SI-NEXT: s_add_i32 s27, s27, 0x30000 -; SI-NEXT: s_add_i32 s26, s26, 0x30000 -; SI-NEXT: s_and_b32 s86, s27, 0xffff0000 -; SI-NEXT: s_lshl_b32 s27, s27, 16 -; SI-NEXT: s_add_i32 s25, s25, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s27, 25 -; SI-NEXT: s_and_b32 s96, s26, 0xffff0000 -; SI-NEXT: s_lshl_b32 s26, s26, 16 -; SI-NEXT: s_add_i32 s24, s24, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s26, 26 -; SI-NEXT: s_and_b32 s99, s25, 0xffff0000 -; SI-NEXT: s_lshl_b32 s25, s25, 16 -; SI-NEXT: s_add_i32 s23, s23, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s25, 27 -; SI-NEXT: s_and_b32 s97, s24, 0xffff0000 -; SI-NEXT: s_lshl_b32 s24, s24, 16 ; SI-NEXT: s_add_i32 s80, s80, 3 -; SI-NEXT: s_add_i32 s22, s22, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s24, 28 -; SI-NEXT: s_and_b32 s92, s23, 0xffff0000 -; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_add_i32 s27, s27, 0x30000 ; SI-NEXT: s_and_b32 s4, s80, 0xffff ; SI-NEXT: s_add_i32 s84, s84, 3 -; SI-NEXT: s_add_i32 s21, s21, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s23, 29 -; SI-NEXT: s_and_b32 s94, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_and_b32 s28, s27, 0xffff0000 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s84, 0xffff ; SI-NEXT: s_add_i32 s83, s83, 3 -; SI-NEXT: s_add_i32 s18, s18, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s22, 30 -; SI-NEXT: s_and_b32 s95, s21, 0xffff0000 -; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_add_i32 s26, s26, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s28, 31 +; SI-NEXT: s_lshl_b32 s27, s27, 16 ; SI-NEXT: s_or_b32 s5, s60, s5 ; SI-NEXT: s_and_b32 s60, s83, 0xffff +; SI-NEXT: s_lshl_b32 s61, s79, 16 ; SI-NEXT: s_add_i32 s87, s87, 3 ; SI-NEXT: 
s_add_i32 s59, s59, 3 ; SI-NEXT: s_add_i32 s57, s57, 3 @@ -241382,13 +242566,11 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_add_i32 s45, s45, 3 ; SI-NEXT: s_add_i32 s43, s43, 3 ; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s19, s19, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s21, 31 -; SI-NEXT: s_and_b32 s93, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_or_b32 s76, s61, s60 +; SI-NEXT: v_writelane_b32 v41, s27, 32 +; SI-NEXT: s_and_b32 s27, s26, 0xffff0000 +; SI-NEXT: s_or_b32 vcc_lo, s61, s60 ; SI-NEXT: s_and_b32 s60, s87, 0xffff -; SI-NEXT: s_lshl_b32 s61, s73, 16 +; SI-NEXT: s_lshl_b32 s61, s77, 16 ; SI-NEXT: s_and_b32 s59, s59, 0xffff ; SI-NEXT: s_lshl_b32 s58, s58, 16 ; SI-NEXT: s_and_b32 s57, s57, 0xffff @@ -241401,24 +242583,22 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_lshl_b32 s42, s42, 16 ; SI-NEXT: s_and_b32 s41, s41, 0xffff ; SI-NEXT: s_lshl_b32 s40, s40, 16 -; SI-NEXT: s_add_i32 s16, s16, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s18, 32 -; SI-NEXT: s_lshl_b32 s18, s19, 16 -; SI-NEXT: s_or_b32 s75, s61, s60 +; SI-NEXT: s_add_i32 s25, s25, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s27, 33 +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 vcc_hi, s61, s60 ; SI-NEXT: s_or_b32 s58, s58, s59 ; SI-NEXT: s_or_b32 s56, s56, s57 ; SI-NEXT: s_or_b32 s46, s46, s47 ; SI-NEXT: s_or_b32 s44, s44, s45 ; SI-NEXT: s_or_b32 s42, s42, s43 ; SI-NEXT: s_or_b32 s40, s40, s41 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s18, 33 -; SI-NEXT: s_and_b32 s31, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: v_writelane_b32 v41, s26, 34 +; SI-NEXT: s_and_b32 s26, s25, 0xffff0000 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s76, s76, 0x30000 -; SI-NEXT: s_add_i32 s75, s75, 0x30000 +; SI-NEXT: s_add_i32 vcc_lo, vcc_lo, 0x30000 +; SI-NEXT: s_add_i32 vcc_hi, vcc_hi, 0x30000 ; SI-NEXT: s_add_i32 s58, s58, 0x30000 ; SI-NEXT: s_add_i32 s56, s56, 0x30000 ; SI-NEXT: s_add_i32 s46, s46, 0x30000 @@ -241429,294 +242609,311 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_add_i32 s12, s12, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 ; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s11, s11, 0x30000 ; SI-NEXT: s_add_i32 s13, s13, 0x30000 ; SI-NEXT: s_add_i32 s15, s15, 0x30000 -; SI-NEXT: s_add_i32 s20, s20, 0x30000 +; SI-NEXT: s_add_i32 s19, s19, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 ; SI-NEXT: s_add_i32 s17, s17, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s16, 34 -; SI-NEXT: s_and_b32 s34, s6, 0xffff0000 -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s30, s19, 0xffff0000 -; SI-NEXT: v_writelane_b32 v41, s6, 35 -; SI-NEXT: s_and_b32 s35, s7, 0xffff0000 -; SI-NEXT: s_lshl_b32 s6, s7, 16 -; SI-NEXT: s_and_b32 s36, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_and_b32 s37, s20, 0xffff0000 -; SI-NEXT: s_lshl_b32 s22, s20, 16 -; SI-NEXT: s_and_b32 s38, s15, 0xffff0000 -; SI-NEXT: s_lshl_b32 s24, s15, 16 -; SI-NEXT: s_and_b32 s39, s13, 0xffff0000 -; SI-NEXT: s_lshl_b32 s28, s13, 16 -; SI-NEXT: s_and_b32 s48, s11, 0xffff0000 -; SI-NEXT: s_lshl_b32 s61, s11, 16 +; SI-NEXT: s_add_i32 s20, s20, 0x30000 +; SI-NEXT: s_add_i32 s18, s18, 0x30000 +; SI-NEXT: 
s_add_i32 s21, s21, 0x30000 +; SI-NEXT: s_add_i32 s22, s22, 0x30000 +; SI-NEXT: s_add_i32 s23, s23, 0x30000 +; SI-NEXT: s_add_i32 s24, s24, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s26, 35 +; SI-NEXT: s_lshl_b32 s25, s25, 16 +; SI-NEXT: v_writelane_b32 v41, s25, 36 +; SI-NEXT: s_and_b32 s25, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s55, s24, 16 +; SI-NEXT: s_and_b32 s73, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s54, s23, 16 +; SI-NEXT: s_and_b32 s63, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s98, s22, 16 +; SI-NEXT: s_and_b32 s62, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s96, s21, 16 +; SI-NEXT: s_and_b32 s99, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s97, s18, 16 +; SI-NEXT: s_and_b32 s86, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s85, s20, 16 +; SI-NEXT: s_and_b32 s82, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s81, s17, 16 +; SI-NEXT: s_and_b32 s71, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s70, s6, 16 +; SI-NEXT: s_and_b32 s69, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s68, s16, 16 +; SI-NEXT: s_and_b32 s67, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s66, s19, 16 +; SI-NEXT: s_and_b32 s65, s15, 0xffff0000 +; SI-NEXT: s_lshl_b32 s64, s15, 16 +; SI-NEXT: s_and_b32 s53, s13, 0xffff0000 +; SI-NEXT: s_lshl_b32 s52, s13, 16 +; SI-NEXT: s_and_b32 s51, s11, 0xffff0000 +; SI-NEXT: s_lshl_b32 s50, s11, 16 ; SI-NEXT: s_and_b32 s49, s9, 0xffff0000 -; SI-NEXT: s_lshl_b32 s89, s9, 16 -; SI-NEXT: s_and_b32 s50, s8, 0xffff0000 -; SI-NEXT: s_lshl_b32 s60, s8, 16 -; SI-NEXT: s_and_b32 s91, s10, 0xffff0000 -; SI-NEXT: s_lshl_b32 s90, s10, 16 -; SI-NEXT: s_and_b32 s51, s12, 0xffff0000 -; SI-NEXT: s_lshl_b32 s70, s12, 16 -; SI-NEXT: s_and_b32 s52, s14, 0xffff0000 -; SI-NEXT: s_lshl_b32 s71, s14, 16 -; SI-NEXT: s_and_b32 s53, s40, 0xffff0000 -; SI-NEXT: s_lshl_b32 s20, s40, 16 -; SI-NEXT: s_and_b32 s54, s42, 0xffff0000 -; SI-NEXT: s_lshl_b32 s81, s42, 16 -; SI-NEXT: s_and_b32 s55, s44, 0xffff0000 -; SI-NEXT: s_lshl_b32 s63, s44, 16 -; SI-NEXT: s_and_b32 s64, s46, 0xffff0000 -; SI-NEXT: s_lshl_b32 s72, s46, 16 -; SI-NEXT: s_and_b32 s65, s56, 0xffff0000 -; SI-NEXT: s_lshl_b32 s82, s56, 16 -; SI-NEXT: s_and_b32 s66, s58, 0xffff0000 -; SI-NEXT: s_lshl_b32 s74, s58, 16 -; SI-NEXT: s_and_b32 s67, s75, 0xffff0000 -; SI-NEXT: s_lshl_b32 s75, s75, 16 -; SI-NEXT: s_and_b32 s68, s76, 0xffff0000 -; SI-NEXT: s_lshl_b32 s76, s76, 16 -; SI-NEXT: s_and_b32 s69, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s85, s5, 16 -; SI-NEXT: s_and_b32 s26, s4, 0xffff0000 -; SI-NEXT: s_lshl_b32 s5, s4, 16 -; SI-NEXT: v_writelane_b32 v41, s6, 36 +; SI-NEXT: s_lshl_b32 s48, s9, 16 +; SI-NEXT: s_and_b32 s39, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s38, s7, 16 +; SI-NEXT: s_and_b32 s37, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s36, s8, 16 +; SI-NEXT: s_and_b32 s35, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s31, s10, 16 +; SI-NEXT: s_and_b32 s34, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s95, s12, 16 +; SI-NEXT: s_and_b32 s30, s14, 0xffff0000 +; SI-NEXT: s_lshl_b32 s93, s14, 16 +; SI-NEXT: s_and_b32 s94, s40, 0xffff0000 +; SI-NEXT: s_lshl_b32 s91, s40, 16 +; SI-NEXT: s_and_b32 s92, s42, 0xffff0000 +; SI-NEXT: s_lshl_b32 s88, s42, 16 +; SI-NEXT: s_and_b32 s90, s44, 0xffff0000 +; SI-NEXT: s_lshl_b32 s78, s44, 16 +; SI-NEXT: s_and_b32 s28, s46, 0xffff0000 +; SI-NEXT: s_lshl_b32 s76, s46, 16 +; SI-NEXT: s_and_b32 s26, s56, 0xffff0000 +; SI-NEXT: s_lshl_b32 s74, s56, 16 +; SI-NEXT: s_and_b32 s24, s58, 0xffff0000 +; SI-NEXT: s_lshl_b32 s72, s58, 16 +; SI-NEXT: s_and_b32 s22, vcc_hi, 0xffff0000 +; SI-NEXT: s_lshl_b32 s61, vcc_hi, 16 +; SI-NEXT: s_and_b32 s20, vcc_lo, 0xffff0000 +; SI-NEXT: 
s_lshl_b32 s60, vcc_lo, 16 +; SI-NEXT: s_and_b32 s19, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s5, 16 +; SI-NEXT: s_and_b32 s5, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_writelane_b32 v41, s25, 37 ; SI-NEXT: .LBB107_5: ; %end -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s86 -; SI-NEXT: v_readlane_b32 s4, v41, 25 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 31 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_readlane_b32 s6, v41, 32 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_readlane_b32 s6, v41, 33 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s96 -; SI-NEXT: v_readlane_b32 s4, v41, 26 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_readlane_b32 s6, v41, 34 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: v_readlane_b32 s6, v41, 35 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s99 -; SI-NEXT: v_readlane_b32 s4, v41, 27 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_readlane_b32 s6, v41, 36 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: v_readlane_b32 s6, v41, 37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s97 -; SI-NEXT: v_readlane_b32 s4, v41, 28 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 -; SI-NEXT: v_readlane_b32 s4, v41, 29 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94 -; SI-NEXT: v_readlane_b32 s4, v41, 30 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s98 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 -; SI-NEXT: v_readlane_b32 s4, v41, 31 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; 
SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s96 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 -; SI-NEXT: v_readlane_b32 s4, v41, 32 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s99 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s97 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30 -; SI-NEXT: v_readlane_b32 s4, v41, 33 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s86 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s85 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 -; SI-NEXT: v_readlane_b32 s4, v41, 34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s82 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s81 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34 -; SI-NEXT: v_readlane_b32 s4, v41, 35 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s71 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s70 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 -; SI-NEXT: v_readlane_b32 s4, v41, 36 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s36 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s22 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, 
vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s38 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s24 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s28 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s50 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s48 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s61 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s89 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s38 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s50 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s60 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s36 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s90 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s70 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s71 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, 
vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s20 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s81 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s88 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s63 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s78 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s72 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s28 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s82 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s26 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s74 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s24 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s72 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s75 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s22 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s20 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60 +; SI-NEXT: v_lshr_b64 
v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s85 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s26 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s5 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: v_readlane_b32 s30, v40, 34 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll index c6211aae19c1b..22dd3a0438136 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll @@ -2853,50 +2853,52 @@ define inreg <4 x i32> @bitcast_v8bf16_to_v4i32_scalar(<8 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22 ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshr_b64 v[0:1], v[10:11], 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshr_b64 v[1:2], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_lshr_b64 v[16:17], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[6:7], 16 +; SI-NEXT: v_mov_b32_e32 v3, v16 ; SI-NEXT: s_cbranch_execnz .LBB23_3 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 +; SI-NEXT: 
v_and_b32_e32 v1, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: .LBB23_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB23_4: @@ -2912,78 +2914,80 @@ define inreg <4 x i32> @bitcast_v8bf16_to_v4i32_scalar(<8 x bfloat> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB23_4 ; VI-NEXT: .LBB23_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v7, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v4, 
v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v4, v1, 16 -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; VI-NEXT: v_lshrrev_b64 v[5:6], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v5 +; VI-NEXT: v_mov_b32_e32 v3, v4 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB23_3: ; VI-NEXT: s_branch .LBB23_2 @@ -7392,50 +7396,52 @@ define inreg <4 x float> @bitcast_v8bf16_to_v4f32_scalar(<8 x bfloat> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; 
SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshr_b64 v[0:1], v[10:11], 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshr_b64 v[1:2], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_lshr_b64 v[16:17], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[6:7], 16 +; SI-NEXT: v_mov_b32_e32 v3, v16 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: .LBB47_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: @@ -7451,78 +7457,80 @@ define inreg <4 x float> @bitcast_v8bf16_to_v4f32_scalar(<8 x bfloat> inreg %a, ; VI-NEXT: s_cbranch_execnz .LBB47_4 ; VI-NEXT: .LBB47_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; 
VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v7, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, 
v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v4, v1, 16 -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; VI-NEXT: v_lshrrev_b64 v[5:6], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v5 +; VI-NEXT: v_mov_b32_e32 v3, v4 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB47_3: ; VI-NEXT: s_branch .LBB47_2 @@ -11581,50 +11589,52 @@ define inreg <2 x i64> @bitcast_v8bf16_to_v2i64_scalar(<8 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22 ; SI-NEXT: s_cbranch_scc0 .LBB67_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshr_b64 v[0:1], v[10:11], 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshr_b64 v[1:2], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_lshr_b64 v[16:17], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[6:7], 16 +; SI-NEXT: v_mov_b32_e32 v3, v16 ; SI-NEXT: s_cbranch_execnz .LBB67_3 ; SI-NEXT: .LBB67_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 -; 
SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: .LBB67_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB67_4: @@ -11640,78 +11650,80 @@ define inreg <2 x i64> @bitcast_v8bf16_to_v2i64_scalar(<8 x bfloat> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB67_4 ; VI-NEXT: .LBB67_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v7, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 
s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v4, v1, 16 -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; VI-NEXT: v_lshrrev_b64 v[5:6], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v5 +; VI-NEXT: v_mov_b32_e32 v3, v4 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB67_3: ; VI-NEXT: s_branch .LBB67_2 @@ -15349,50 +15361,52 @@ define inreg <2 x double> @bitcast_v8bf16_to_v2f64_scalar(<8 x bfloat> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22 ; SI-NEXT: s_cbranch_scc0 .LBB83_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshr_b64 v[0:1], v[10:11], 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshr_b64 v[1:2], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_lshr_b64 v[16:17], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[6:7], 16 +; SI-NEXT: v_mov_b32_e32 v3, v16 ; SI-NEXT: s_cbranch_execnz .LBB83_3 ; SI-NEXT: .LBB83_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: 
v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: .LBB83_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB83_4: @@ -15408,78 +15422,80 @@ define inreg <2 x double> @bitcast_v8bf16_to_v2f64_scalar(<8 x bfloat> inreg %a, ; VI-NEXT: s_cbranch_execnz .LBB83_4 ; VI-NEXT: .LBB83_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v7, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 +; 
VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v4, v1, 16 -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; VI-NEXT: v_lshrrev_b64 v[5:6], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v5 +; VI-NEXT: v_mov_b32_e32 v3, v4 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB83_3: ; VI-NEXT: s_branch .LBB83_2 @@ -18719,66 +18735,68 @@ define inreg <8 x i16> @bitcast_v8bf16_to_v8i16_scalar(<8 x bfloat> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: v_mul_f32_e64 v15, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s23 ; SI-NEXT: s_cbranch_scc0 .LBB95_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: 
v_lshrrev_b32_e32 v4, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 ; SI-NEXT: s_cbranch_execnz .LBB95_3 ; SI-NEXT: .LBB95_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v12 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v6, v7, v3, 16 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; SI-NEXT: v_lshr_b64 v[10:11], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[8:9], v[5:6], 16 -; SI-NEXT: v_alignbit_b32 v4, v12, v13, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_lshr_b64 v[4:5], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; SI-NEXT: v_lshr_b64 v[2:3], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[6:7], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; SI-NEXT: v_lshr_b64 v[11:12], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[12:13], v[5:6], 16 ; SI-NEXT: .LBB95_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v10 -; SI-NEXT: v_mov_b32_e32 v5, v8 +; SI-NEXT: v_mov_b32_e32 v1, v11 +; SI-NEXT: v_mov_b32_e32 v3, v10 +; SI-NEXT: v_mov_b32_e32 v5, v12 +; SI-NEXT: v_mov_b32_e32 v7, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB95_4: ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_branch .LBB95_2 ; ; VI-LABEL: bitcast_v8bf16_to_v8i16_scalar: @@ -18790,78 +18808,80 @@ define inreg <8 x i16> @bitcast_v8bf16_to_v8i16_scalar(<8 x bfloat> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB95_4 ; VI-NEXT: .LBB95_2: ; 
%cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_mov_b32_e32 v7, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v7 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v7 +; VI-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v7 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v7 +; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; VI-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v8, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc +; VI-NEXT: v_add_f32_e32 v6, s4, v7 +; VI-NEXT: v_bfe_u32 v8, v6, 16, 1 +; VI-NEXT: 
v_add_u32_e32 v8, vcc, v8, v6 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; VI-NEXT: v_bfe_u32 v8, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v7, s4, v7 +; VI-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc +; VI-NEXT: v_bfe_u32 v8, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v3, v0, v3, 16 -; VI-NEXT: v_alignbit_b32 v2, v7, v2, 16 -; VI-NEXT: v_alignbit_b32 v1, v6, v1, 16 -; VI-NEXT: v_alignbit_b32 v0, v5, v4, 16 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[3:4], 16, v[4:5] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v3 +; VI-NEXT: v_mov_b32_e32 v3, v6 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB95_3: ; VI-NEXT: s_branch .LBB95_2 @@ -21770,78 +21790,80 @@ define inreg <8 x half> @bitcast_v8bf16_to_v8f16_scalar(<8 x bfloat> inreg %a, i ; VI-NEXT: s_cbranch_execnz .LBB103_4 ; VI-NEXT: .LBB103_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_mov_b32_e32 v7, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v7 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v7 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v7 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 
vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v7 +; VI-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v7 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v7 +; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; VI-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v8, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc +; VI-NEXT: v_add_f32_e32 v6, s4, v7 +; VI-NEXT: v_bfe_u32 v8, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v6 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; VI-NEXT: v_bfe_u32 v8, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v7, s4, v7 +; VI-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc +; VI-NEXT: v_bfe_u32 v8, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v3, v0, v3, 16 -; VI-NEXT: v_alignbit_b32 v2, v7, v2, 16 -; VI-NEXT: v_alignbit_b32 v1, v6, v1, 16 -; VI-NEXT: v_alignbit_b32 v0, v5, v4, 16 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[3:4], 16, v[4:5] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v3 +; VI-NEXT: v_mov_b32_e32 v3, v6 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB103_3: ; VI-NEXT: s_branch .LBB103_2 @@ -24532,92 +24554,92 @@ define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s17 -; SI-NEXT: 
v_mul_f32_e64 v16, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v26, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v23, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v24, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s22 ; SI-NEXT: s_cbranch_scc0 .LBB109_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_alignbit_b32 v19, v1, v16, 16 -; SI-NEXT: v_alignbit_b32 v20, v6, v8, 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[19:20], 8 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v30 +; SI-NEXT: v_lshr_b64 v[19:20], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v27 +; SI-NEXT: v_lshr_b64 v[22:23], v[25:26], 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v29 +; SI-NEXT: v_lshr_b64 v[20:21], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[10:11], v[22:23], 16 ; SI-NEXT: v_lshr_b64 v[3:4], v[19:20], 24 -; SI-NEXT: v_alignbit_b32 v21, v2, v26, 16 -; SI-NEXT: v_alignbit_b32 v22, v14, v24, 16 -; SI-NEXT: v_lshr_b64 v[4:5], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[10:11], v[21:22], 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v20 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v23 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v22 -; SI-NEXT: v_lshr_b64 v[17:18], v[21:22], 24 -; SI-NEXT: v_lshr_b64 v[11:12], v[21:22], 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v27 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v29 +; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v16, 8, v23 +; SI-NEXT: v_lshr_b64 v[1:2], v[19:20], 8 +; SI-NEXT: v_lshr_b64 v[17:18], v[22:23], 24 +; SI-NEXT: v_lshr_b64 v[11:12], v[22:23], 8 ; SI-NEXT: s_cbranch_execnz .LBB109_3 ; SI-NEXT: .LBB109_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v21, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; SI-NEXT: v_alignbit_b32 v22, v14, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; SI-NEXT: v_alignbit_b32 v19, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshr_b64 v[22:23], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 +; SI-NEXT: 
v_add_f32_e32 v12, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_alignbit_b32 v20, v6, v1, 16 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshr_b64 v[19:20], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_lshr_b64 v[20:21], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[10:11], v[22:23], 16 ; SI-NEXT: v_lshr_b64 v[3:4], v[19:20], 24 -; SI-NEXT: v_lshr_b64 v[10:11], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[4:5], v[19:20], 16 ; SI-NEXT: v_lshr_b64 v[1:2], v[19:20], 8 -; SI-NEXT: v_lshr_b64 v[17:18], v[21:22], 24 -; SI-NEXT: v_lshr_b64 v[11:12], v[21:22], 8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v20 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v22 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v15 +; SI-NEXT: v_lshr_b64 v[17:18], v[22:23], 24 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshr_b64 v[11:12], v[22:23], 8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v16, 8, v23 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 ; SI-NEXT: .LBB109_3: ; %end +; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: v_mov_b32_e32 v0, v19 -; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v4, v20 -; SI-NEXT: v_mov_b32_e32 v8, v21 +; SI-NEXT: v_mov_b32_e32 v5, v9 +; SI-NEXT: v_mov_b32_e32 v8, v22 ; SI-NEXT: v_mov_b32_e32 v9, v11 ; SI-NEXT: v_mov_b32_e32 v11, v17 -; SI-NEXT: v_mov_b32_e32 v12, v22 +; SI-NEXT: v_mov_b32_e32 v12, v23 +; SI-NEXT: v_mov_b32_e32 v13, v16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB109_4: ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: s_branch .LBB109_2 @@ -24628,142 +24650,143 @@ define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i3 ; VI-NEXT: s_cmp_lg_u32 s20, 0 ; VI-NEXT: s_cbranch_scc0 .LBB109_3 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s10, s19, 24 -; VI-NEXT: s_lshr_b32 s11, s19, 16 -; VI-NEXT: s_lshr_b32 s13, s19, 8 -; VI-NEXT: s_lshr_b32 s12, s18, 16 -; VI-NEXT: s_lshr_b32 s14, s18, 8 -; VI-NEXT: s_lshr_b32 s15, s17, 24 -; VI-NEXT: s_lshr_b32 s20, s17, 16 -; VI-NEXT: s_lshr_b32 s22, s17, 8 -; VI-NEXT: s_lshr_b32 s21, s16, 16 -; VI-NEXT: s_lshr_b32 s23, s16, 8 +; VI-NEXT: s_lshr_b32 s21, s19, 24 +; VI-NEXT: s_lshr_b32 s20, s19, 16 +; VI-NEXT: s_lshr_b32 s15, s19, 8 +; VI-NEXT: s_lshr_b32 s23, s18, 16 +; VI-NEXT: s_lshr_b32 s22, s18, 8 +; VI-NEXT: s_lshr_b32 s12, s17, 24 +; VI-NEXT: s_lshr_b32 s11, s17, 16 +; VI-NEXT: s_lshr_b32 s10, s17, 8 +; VI-NEXT: s_lshr_b32 s14, s16, 16 +; VI-NEXT: s_lshr_b32 s13, s16, 8 ; VI-NEXT: 
s_lshr_b64 s[6:7], s[18:19], 24 ; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 ; VI-NEXT: s_cbranch_execnz .LBB109_4 ; VI-NEXT: .LBB109_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v6, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v6 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v6 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v19, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v6 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v6 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v6 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v18, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v6 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[2:3] +; VI-NEXT: v_add_f32_e32 v2, s4, v6 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 
1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v17, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v16, v0, v1, 16 -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v17 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v19 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v6 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[2:3] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v9, v12 +; VI-NEXT: v_mov_b32_e32 v1, v4 +; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[8:9] +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v0 ; VI-NEXT: s_branch .LBB109_5 ; VI-NEXT: .LBB109_3: -; VI-NEXT: ; implicit-def: $sgpr23 -; VI-NEXT: ; implicit-def: $sgpr21 -; VI-NEXT: ; implicit-def: $sgpr4 -; VI-NEXT: ; implicit-def: $sgpr22 -; VI-NEXT: ; implicit-def: $sgpr20 -; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr13 ; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr11 ; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr22 +; VI-NEXT: ; implicit-def: $sgpr23 ; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr13 -; VI-NEXT: ; implicit-def: $sgpr11 -; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr20 +; VI-NEXT: ; implicit-def: $sgpr21 ; VI-NEXT: s_branch .LBB109_2 ; VI-NEXT: .LBB109_4: -; VI-NEXT: 
v_mov_b32_e32 v18, s16 -; VI-NEXT: v_mov_b32_e32 v19, s17 -; VI-NEXT: v_mov_b32_e32 v16, s18 -; VI-NEXT: v_mov_b32_e32 v17, s19 -; VI-NEXT: v_mov_b32_e32 v1, s23 -; VI-NEXT: v_mov_b32_e32 v2, s21 -; VI-NEXT: v_mov_b32_e32 v5, s22 -; VI-NEXT: v_mov_b32_e32 v6, s20 -; VI-NEXT: v_mov_b32_e32 v7, s15 -; VI-NEXT: v_mov_b32_e32 v9, s14 -; VI-NEXT: v_mov_b32_e32 v10, s12 -; VI-NEXT: v_mov_b32_e32 v13, s13 -; VI-NEXT: v_mov_b32_e32 v14, s11 -; VI-NEXT: v_mov_b32_e32 v15, s10 -; VI-NEXT: v_mov_b32_e32 v11, s6 -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v12, s19 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v10, s23 +; VI-NEXT: v_mov_b32_e32 v9, s22 +; VI-NEXT: v_mov_b32_e32 v15, s21 +; VI-NEXT: v_mov_b32_e32 v14, s20 +; VI-NEXT: v_mov_b32_e32 v13, s15 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: v_mov_b32_e32 v11, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v6, s11 +; VI-NEXT: v_mov_b32_e32 v5, s10 +; VI-NEXT: v_mov_b32_e32 v16, s6 +; VI-NEXT: v_mov_b32_e32 v17, s4 ; VI-NEXT: .LBB109_5: ; %end -; VI-NEXT: v_mov_b32_e32 v0, v18 -; VI-NEXT: v_mov_b32_e32 v4, v19 -; VI-NEXT: v_mov_b32_e32 v8, v16 -; VI-NEXT: v_mov_b32_e32 v12, v17 +; VI-NEXT: v_mov_b32_e32 v3, v17 +; VI-NEXT: v_mov_b32_e32 v1, v11 +; VI-NEXT: v_mov_b32_e32 v11, v16 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v8bf16_to_v16i8_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll index a48eb27460f7d..d48c6833d33a0 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll @@ -4052,90 +4052,92 @@ define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v0 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; SI-NEXT: 
v_lshrrev_b32_e32 v6, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v30 +; SI-NEXT: v_lshr_b64 v[0:1], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[20:21], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29 +; SI-NEXT: v_lshr_b64 v[2:3], v[18:19], 16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v28 +; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v27 +; SI-NEXT: v_lshr_b64 v[4:5], v[14:15], 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v24 +; SI-NEXT: v_lshr_b64 v[5:6], v[12:13], 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v25 +; SI-NEXT: v_lshr_b64 v[32:33], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[6:7], v[10:11], 16 +; SI-NEXT: v_mov_b32_e32 v7, v32 ; SI-NEXT: s_cbranch_execnz .LBB23_3 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v28 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v27 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; 
SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v24 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_mov_b32_e32 v7, v8 ; SI-NEXT: .LBB23_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB23_4: @@ -4151,150 +4153,153 @@ define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: s_cbranch_execnz .LBB23_4 ; VI-NEXT: .LBB23_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v8, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v8 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v8 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s22, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v8 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[2:3] +; VI-NEXT: v_add_f32_e32 v2, s4, v8 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, 
v3 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v8 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[2:3] +; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_cndmask_b32_e32 v11, v3, v5, vcc ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; 
VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v8, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v8, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v2 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v8, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; VI-NEXT: v_add_f32_e32 v8, s4, v0 -; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_cndmask_b32_e32 v13, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v8, v1, 16 -; VI-NEXT: v_add_f32_e32 v8, s4, v0 -; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; VI-NEXT: v_bfe_u32 v9, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v0 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; 
VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v8 +; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v7 +; VI-NEXT: v_lshrrev_b64 v[7:8], 16, v[0:1] +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[9:10], 16, v[11:12] +; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[13:14] +; VI-NEXT: v_cndmask_b32_e64 v0, v5, v15, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v10 +; VI-NEXT: v_mov_b32_e32 v3, v9 +; VI-NEXT: v_mov_b32_e32 v5, v8 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB23_3: ; VI-NEXT: s_branch .LBB23_2 @@ -11204,90 +11209,92 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v0 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v30 +; SI-NEXT: v_lshr_b64 v[0:1], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[20:21], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29 +; SI-NEXT: v_lshr_b64 v[2:3], v[18:19], 16 +; SI-NEXT: 
v_lshrrev_b32_e32 v17, 16, v28 +; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v27 +; SI-NEXT: v_lshr_b64 v[4:5], v[14:15], 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v24 +; SI-NEXT: v_lshr_b64 v[5:6], v[12:13], 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v25 +; SI-NEXT: v_lshr_b64 v[32:33], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[6:7], v[10:11], 16 +; SI-NEXT: v_mov_b32_e32 v7, v32 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v28 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v27 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v24 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v6, 
v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_mov_b32_e32 v7, v8 ; SI-NEXT: .LBB47_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: @@ -11303,150 +11310,153 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a ; VI-NEXT: s_cbranch_execnz .LBB47_4 ; VI-NEXT: .LBB47_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v8, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v8 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v8 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s22, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v8 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[2:3] +; VI-NEXT: v_add_f32_e32 v2, s4, v8 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, 
s4, v8 +; VI-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v8 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[2:3] +; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_cndmask_b32_e32 v11, v3, v5, vcc ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v8, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v8, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v2 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 
v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v8, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; VI-NEXT: v_add_f32_e32 v8, s4, v0 -; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_cndmask_b32_e32 v13, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v8, v1, 16 -; VI-NEXT: v_add_f32_e32 v8, s4, v0 -; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; VI-NEXT: v_bfe_u32 v9, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v0 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v8 +; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v7 +; VI-NEXT: v_lshrrev_b64 v[7:8], 16, v[0:1] +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[9:10], 16, v[11:12] +; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, 
v[13:14] +; VI-NEXT: v_cndmask_b32_e64 v0, v5, v15, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v10 +; VI-NEXT: v_mov_b32_e32 v3, v9 +; VI-NEXT: v_mov_b32_e32 v5, v8 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB47_3: ; VI-NEXT: s_branch .LBB47_2 @@ -17924,90 +17934,92 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v0 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB67_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v30 +; SI-NEXT: v_lshr_b64 v[0:1], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[20:21], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29 +; SI-NEXT: v_lshr_b64 v[2:3], v[18:19], 16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v28 +; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v27 +; SI-NEXT: v_lshr_b64 v[4:5], v[14:15], 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v24 +; SI-NEXT: v_lshr_b64 v[5:6], v[12:13], 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v25 +; SI-NEXT: v_lshr_b64 v[32:33], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[6:7], v[10:11], 16 +; SI-NEXT: v_mov_b32_e32 v7, v32 ; SI-NEXT: s_cbranch_execnz .LBB67_3 ; SI-NEXT: .LBB67_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v0, 
0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v28 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v27 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v24 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_mov_b32_e32 v7, v8 ; SI-NEXT: .LBB67_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB67_4: @@ -18023,150 +18035,153 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: s_cbranch_execnz .LBB67_4 ; VI-NEXT: .LBB67_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: 
v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v8, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v8 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v8 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s22, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v8 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[2:3] +; VI-NEXT: v_add_f32_e32 v2, s4, v8 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 
0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v8 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[2:3] +; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_cndmask_b32_e32 v11, v3, v5, vcc ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v8, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v8, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v2 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; 
VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v8, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; VI-NEXT: v_add_f32_e32 v8, s4, v0 -; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_cndmask_b32_e32 v13, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v8, v1, 16 -; VI-NEXT: v_add_f32_e32 v8, s4, v0 -; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; VI-NEXT: v_bfe_u32 v9, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v0 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v8 +; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v7 +; VI-NEXT: v_lshrrev_b64 v[7:8], 16, v[0:1] +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[9:10], 16, v[11:12] +; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[13:14] +; VI-NEXT: v_cndmask_b32_e64 v0, v5, v15, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v10 +; VI-NEXT: v_mov_b32_e32 v3, v9 +; VI-NEXT: v_mov_b32_e32 v5, v8 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB67_3: ; VI-NEXT: s_branch .LBB67_2 @@ -24092,90 +24107,92 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg % ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19 -; 
SI-NEXT: v_mul_f32_e64 v21, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v0 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB83_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v30 +; SI-NEXT: v_lshr_b64 v[0:1], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[20:21], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29 +; SI-NEXT: v_lshr_b64 v[2:3], v[18:19], 16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v28 +; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v27 +; SI-NEXT: v_lshr_b64 v[4:5], v[14:15], 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v24 +; SI-NEXT: v_lshr_b64 v[5:6], v[12:13], 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v25 +; SI-NEXT: v_lshr_b64 v[32:33], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[6:7], v[10:11], 16 +; SI-NEXT: v_mov_b32_e32 v7, v32 ; SI-NEXT: s_cbranch_execnz .LBB83_3 ; SI-NEXT: .LBB83_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: 
v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v28 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v27 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v24 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_mov_b32_e32 v7, v8 ; SI-NEXT: .LBB83_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB83_4: @@ -24191,150 +24208,153 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg % ; VI-NEXT: s_cbranch_execnz .LBB83_4 ; VI-NEXT: .LBB83_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v8, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v8 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v8 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; 
VI-NEXT: s_lshl_b32 s4, s22, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v8 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[2:3] +; VI-NEXT: v_add_f32_e32 v2, s4, v8 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v8 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: 
v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[2:3] +; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_cndmask_b32_e32 v11, v3, v5, vcc ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v8, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v8, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v2 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v8, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; VI-NEXT: v_add_f32_e32 v8, s4, v0 -; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v10, 
0x400000, v8 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_cndmask_b32_e32 v13, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v8, v1, 16 -; VI-NEXT: v_add_f32_e32 v8, s4, v0 -; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; VI-NEXT: v_bfe_u32 v9, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v0 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v8 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v8 +; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v7 +; VI-NEXT: v_lshrrev_b64 v[7:8], 16, v[0:1] +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[9:10], 16, v[11:12] +; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[13:14] +; VI-NEXT: v_cndmask_b32_e64 v0, v5, v15, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v10 +; VI-NEXT: v_mov_b32_e32 v3, v9 +; VI-NEXT: v_mov_b32_e32 v5, v8 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB83_3: ; VI-NEXT: s_branch .LBB83_2 @@ -29752,120 +29772,124 @@ define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v31, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v30, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v20, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v28, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v27, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v26, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v23, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s28 -; SI-NEXT: v_mul_f32_e64 v24, 1.0, s29 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s19 +; 
SI-NEXT: v_mul_f32_e64 v30, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v1 ; SI-NEXT: s_cbranch_scc0 .LBB95_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 ; SI-NEXT: s_cbranch_execnz .LBB95_3 ; SI-NEXT: .LBB95_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v30 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v12 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v28 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v26 -; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_alignbit_b32 v8, v7, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v2 +; SI-NEXT: 
v_lshrrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_lshr_b64 v[4:5], v[1:2], 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_alignbit_b32 v10, v11, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_alignbit_b32 v14, v15, v7, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_lshr_b64 v[8:9], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v6, v7, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshr_b64 v[12:13], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_alignbit_b32 v2, v3, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 -; SI-NEXT: v_lshr_b64 v[17:18], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[18:19], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[19:20], v[13:14], 16 -; SI-NEXT: v_alignbit_b32 v12, v24, v25, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; SI-NEXT: v_lshr_b64 v[2:3], v[21:22], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_lshr_b64 v[6:7], v[19:20], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 +; SI-NEXT: v_lshr_b64 v[10:11], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[13:14], 16 ; SI-NEXT: .LBB95_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v17 -; SI-NEXT: v_mov_b32_e32 v5, v18 -; SI-NEXT: v_mov_b32_e32 v9, v21 -; SI-NEXT: v_mov_b32_e32 v13, v19 +; SI-NEXT: v_mov_b32_e32 v1, v23 +; SI-NEXT: v_mov_b32_e32 v3, v22 +; SI-NEXT: v_mov_b32_e32 v5, v24 +; SI-NEXT: v_mov_b32_e32 v7, v20 +; SI-NEXT: v_mov_b32_e32 v9, v25 +; SI-NEXT: v_mov_b32_e32 v11, v18 +; SI-NEXT: v_mov_b32_e32 v13, v26 +; 
SI-NEXT: v_mov_b32_e32 v15, v16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB95_4: ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: s_branch .LBB95_2 ; @@ -29878,150 +29902,154 @@ define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a ; VI-NEXT: s_cbranch_execnz .LBB95_4 ; VI-NEXT: .LBB95_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v0, s4, v1 -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_mov_b32_e32 v10, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v10 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v10 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v10 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v10 +; VI-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v8, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v10 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: 
v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v10 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v1 +; VI-NEXT: v_add_f32_e32 v4, s4, v10 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_add_f32_e32 v4, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v9, v5, v6, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v10 +; VI-NEXT: v_cndmask_b32_e32 v11, v5, v6, vcc ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_add_f32_e32 v5, s4, v1 -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_add_f32_e32 v5, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v10, v6, v7, vcc +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; VI-NEXT: v_add_f32_e32 v4, s4, v10 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_add_f32_e32 v5, s4, v10 ; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: s_and_b32 s5, s21, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_add_f32_e32 v6, s4, v1 -; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_f32_e32 v13, s5, v10 ; VI-NEXT: s_lshl_b32 s5, s22, 16 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v6 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_add_f32_e32 v6, s5, v1 -; VI-NEXT: v_cndmask_b32_e32 v11, v7, v11, vcc +; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; VI-NEXT: v_add_f32_e32 v6, s5, v10 ; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 ; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v6, v7, v12, vcc -; VI-NEXT: v_add_f32_e32 v7, s5, v1 -; VI-NEXT: v_bfe_u32 v12, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v7 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: 
v_cndmask_b32_e32 v7, v12, v13, vcc -; VI-NEXT: s_lshl_b32 s5, s23, 16 -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 -; VI-NEXT: v_add_f32_e32 v7, s5, v1 -; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: s_and_b32 s5, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc -; VI-NEXT: v_add_f32_e32 v13, s5, v1 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v14, vcc +; VI-NEXT: v_add_f32_e32 v7, s5, v10 +; VI-NEXT: v_bfe_u32 v14, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v7 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v6, v12, v6, 16 -; VI-NEXT: v_add_f32_e32 v12, s4, v1 -; VI-NEXT: v_alignbit_b32 v7, v13, v7, 16 -; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5] +; VI-NEXT: v_cndmask_b32_e32 v7, v14, v15, vcc +; VI-NEXT: v_bfe_u32 v5, v13, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v13 ; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc -; VI-NEXT: v_add_f32_e32 v13, s4, v1 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[6:7] +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_add_f32_e32 v1, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc -; VI-NEXT: v_bfe_u32 v14, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v1 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v14, v15, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; VI-NEXT: v_add_f32_e32 v13, s4, v10 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v7, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v13 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v13, v7, v14, vcc +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; VI-NEXT: v_add_f32_e32 v5, s4, v10 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v15, vcc +; VI-NEXT: v_add_f32_e32 v7, s4, v10 +; 
VI-NEXT: v_bfe_u32 v10, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v7 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v15, v10, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v5, v1, v13, 16 -; VI-NEXT: v_alignbit_b32 v4, v4, v11, 16 -; VI-NEXT: v_alignbit_b32 v3, v3, v10, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v9, 16 -; VI-NEXT: v_alignbit_b32 v1, v15, v8, 16 -; VI-NEXT: v_alignbit_b32 v0, v14, v0, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[13:14], 16, v[13:14] +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[11:12] +; VI-NEXT: v_lshrrev_b64 v[7:8], 16, v[8:9] +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v7 +; VI-NEXT: v_mov_b32_e32 v3, v10 +; VI-NEXT: v_mov_b32_e32 v5, v13 +; VI-NEXT: v_mov_b32_e32 v7, v15 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB95_3: ; VI-NEXT: s_branch .LBB95_2 @@ -35136,150 +35164,154 @@ define inreg <16 x half> @bitcast_v16bf16_to_v16f16_scalar(<16 x bfloat> inreg % ; VI-NEXT: s_cbranch_execnz .LBB103_4 ; VI-NEXT: .LBB103_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v0, s4, v1 -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_mov_b32_e32 v10, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v10 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v10 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v10 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v10 +; VI-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v8, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v10 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; 
VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v10 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v1 +; VI-NEXT: v_add_f32_e32 v4, s4, v10 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_add_f32_e32 v4, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v9, v5, v6, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v10 +; VI-NEXT: v_cndmask_b32_e32 v11, v5, v6, vcc ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_add_f32_e32 v5, s4, v1 -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_add_f32_e32 v5, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v10, v6, v7, vcc +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; VI-NEXT: v_add_f32_e32 v4, s4, v10 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_add_f32_e32 v5, s4, v10 ; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: s_and_b32 s5, s21, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_add_f32_e32 v6, s4, v1 -; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_f32_e32 v13, s5, v10 ; VI-NEXT: s_lshl_b32 s5, s22, 16 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v6 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_add_f32_e32 v6, s5, v1 -; VI-NEXT: v_cndmask_b32_e32 v11, v7, v11, vcc +; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; VI-NEXT: v_add_f32_e32 v6, s5, v10 ; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 ; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v6, v7, v12, vcc -; VI-NEXT: v_add_f32_e32 v7, s5, v1 -; VI-NEXT: v_bfe_u32 v12, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v7 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 -; 
VI-NEXT: v_or_b32_e32 v13, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v12, v13, vcc -; VI-NEXT: s_lshl_b32 s5, s23, 16 -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 -; VI-NEXT: v_add_f32_e32 v7, s5, v1 -; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: s_and_b32 s5, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc -; VI-NEXT: v_add_f32_e32 v13, s5, v1 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v14, vcc +; VI-NEXT: v_add_f32_e32 v7, s5, v10 +; VI-NEXT: v_bfe_u32 v14, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v7 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v6, v12, v6, 16 -; VI-NEXT: v_add_f32_e32 v12, s4, v1 -; VI-NEXT: v_alignbit_b32 v7, v13, v7, 16 -; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5] +; VI-NEXT: v_cndmask_b32_e32 v7, v14, v15, vcc +; VI-NEXT: v_bfe_u32 v5, v13, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v13 ; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc -; VI-NEXT: v_add_f32_e32 v13, s4, v1 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[6:7] +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_add_f32_e32 v1, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc -; VI-NEXT: v_bfe_u32 v14, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v1 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v14, v15, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; VI-NEXT: v_add_f32_e32 v13, s4, v10 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v7, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v13 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v13, v7, v14, vcc +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; VI-NEXT: v_add_f32_e32 v5, s4, v10 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: s_lshl_b32 s4, s23, 16 
+; VI-NEXT: v_cndmask_b32_e32 v5, v7, v15, vcc +; VI-NEXT: v_add_f32_e32 v7, s4, v10 +; VI-NEXT: v_bfe_u32 v10, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v7 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v15, v10, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v5, v1, v13, 16 -; VI-NEXT: v_alignbit_b32 v4, v4, v11, 16 -; VI-NEXT: v_alignbit_b32 v3, v3, v10, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v9, 16 -; VI-NEXT: v_alignbit_b32 v1, v15, v8, 16 -; VI-NEXT: v_alignbit_b32 v0, v14, v0, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[13:14], 16, v[13:14] +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[11:12] +; VI-NEXT: v_lshrrev_b64 v[7:8], 16, v[8:9] +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v7 +; VI-NEXT: v_mov_b32_e32 v3, v10 +; VI-NEXT: v_mov_b32_e32 v5, v13 +; VI-NEXT: v_mov_b32_e32 v7, v15 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB103_3: ; VI-NEXT: s_branch .LBB103_2 @@ -40102,187 +40134,206 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, ; SI-LABEL: bitcast_v16bf16_to_v32i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v24, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v32, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v39, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v55, 1.0, s25 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mul_f32_e64 v40, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s26 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mul_f32_e64 v42, 1.0, s29 +; SI-NEXT: s_waitcnt 
expcnt(6) +; SI-NEXT: v_mul_f32_e64 v46, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v45, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mul_f32_e64 v56, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v47, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s22 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v24, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s27 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v43, 1.0, s28 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v0 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB109_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_alignbit_b32 v48, v1, v32, 16 -; SI-NEXT: v_alignbit_b32 v49, v6, v16, 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 8 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51 -; SI-NEXT: v_alignbit_b32 v37, v2, v52, 16 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v39 -; SI-NEXT: v_alignbit_b32 v35, v2, v40, 16 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v42 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v41 -; SI-NEXT: v_alignbit_b32 v38, v14, v50, 16 -; SI-NEXT: v_alignbit_b32 v36, v22, v54, 16 -; SI-NEXT: v_alignbit_b32 v33, v2, v43, 16 -; SI-NEXT: v_alignbit_b32 v34, v30, v0, 16 -; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 24 -; SI-NEXT: v_lshr_b64 v[11:12], v[37:38], 24 -; SI-NEXT: v_lshr_b64 v[19:20], v[35:36], 24 -; SI-NEXT: v_lshr_b64 v[27:28], v[33:34], 24 -; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16 -; SI-NEXT: v_lshr_b64 v[12:13], v[37:38], 16 -; SI-NEXT: v_lshr_b64 v[20:21], v[35:36], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[33:34], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[37:38], 8 -; SI-NEXT: v_lshr_b64 v[17:18], v[35:36], 8 -; SI-NEXT: v_lshr_b64 v[25:26], v[33:34], 8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v49 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v39 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v38 -; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v53 -; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v36 -; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v41 -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v34 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v46 +; SI-NEXT: v_lshr_b64 v[53:54], v[16:17], 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v47 +; SI-NEXT: v_lshr_b64 v[39:40], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v60 +; SI-NEXT: v_lshr_b64 v[50:51], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v45 +; SI-NEXT: v_lshr_b64 v[54:55], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[42:43], v[32:33], 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v59 +; SI-NEXT: v_lshr_b64 v[51:52], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[43:44], v[29:30], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[53:54], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[53:54], 8 +; SI-NEXT: v_lshr_b64 v[19:20], v[39:40], 24 +; SI-NEXT: v_lshr_b64 v[34:35], v[39:40], 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[50:51], 24 +; SI-NEXT: v_lshr_b64 v[48:49], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[50:51], 8 +; SI-NEXT: v_lshr_b64 v[11:12], 
v[53:54], 24 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v45 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v47 +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v51 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v57 +; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v54 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v59 +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v40 +; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v43 +; SI-NEXT: v_lshr_b64 v[17:18], v[39:40], 8 +; SI-NEXT: v_lshr_b64 v[27:28], v[42:43], 24 +; SI-NEXT: v_lshr_b64 v[37:38], v[42:43], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[42:43], 8 ; SI-NEXT: s_cbranch_execnz .LBB109_3 ; SI-NEXT: .LBB109_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v43 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v31 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 -; SI-NEXT: v_alignbit_b32 v34, v30, v0, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v40 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshr_b64 v[42:43], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v35, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v53 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v51 -; SI-NEXT: v_alignbit_b32 v36, v22, v0, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v52 +; SI-NEXT: v_lshr_b64 v[39:40], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v37, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v39 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v50 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 -; SI-NEXT: v_alignbit_b32 v38, v14, v0, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 +; SI-NEXT: v_lshr_b64 v[53:54], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v48, v1, 
v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v1 +; SI-NEXT: v_lshr_b64 v[50:51], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v45 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_alignbit_b32 v49, v6, v0, 16 -; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 24 -; SI-NEXT: v_lshr_b64 v[11:12], v[37:38], 24 -; SI-NEXT: v_lshr_b64 v[19:20], v[35:36], 24 -; SI-NEXT: v_lshr_b64 v[27:28], v[33:34], 24 -; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 8 -; SI-NEXT: v_lshr_b64 v[12:13], v[37:38], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[37:38], 8 -; SI-NEXT: v_lshr_b64 v[20:21], v[35:36], 16 -; SI-NEXT: v_lshr_b64 v[17:18], v[35:36], 8 -; SI-NEXT: v_lshr_b64 v[28:29], v[33:34], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[33:34], 8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v49 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v38 -; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v36 -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v34 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_lshr_b64 v[54:55], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[43:44], v[29:30], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[53:54], 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[50:51], 24 +; SI-NEXT: v_lshr_b64 v[48:49], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[50:51], 8 +; SI-NEXT: v_lshr_b64 v[11:12], v[53:54], 24 +; SI-NEXT: v_lshr_b64 v[9:10], v[53:54], 8 +; SI-NEXT: v_lshr_b64 v[19:20], v[39:40], 24 +; SI-NEXT: v_lshr_b64 v[34:35], v[39:40], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[39:40], 8 +; SI-NEXT: v_lshr_b64 v[27:28], v[42:43], 24 +; SI-NEXT: v_lshr_b64 v[37:38], v[42:43], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[42:43], 8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v51 +; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v54 +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v40 +; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v43 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0 ; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v15 ; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v23 ; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v31 ; SI-NEXT: .LBB109_3: ; %end -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v0, v48 -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v4, v49 -; SI-NEXT: v_mov_b32_e32 v8, v37 -; SI-NEXT: v_mov_b32_e32 v10, v12 -; SI-NEXT: v_mov_b32_e32 v12, v38 -; SI-NEXT: v_mov_b32_e32 v16, v35 -; SI-NEXT: v_mov_b32_e32 v18, v20 -; SI-NEXT: v_mov_b32_e32 v20, v36 -; SI-NEXT: v_mov_b32_e32 v24, v33 -; SI-NEXT: v_mov_b32_e32 v26, v28 -; SI-NEXT: v_mov_b32_e32 v28, v34 +; SI-NEXT: v_mov_b32_e32 v13, v20 +; SI-NEXT: v_mov_b32_e32 v20, v40 +; SI-NEXT: v_mov_b32_e32 v24, v42 +; SI-NEXT: v_mov_b32_e32 v28, v43 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 
offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v0, v50 +; SI-NEXT: v_mov_b32_e32 v2, v48 +; SI-NEXT: v_mov_b32_e32 v4, v51 +; SI-NEXT: v_mov_b32_e32 v5, v10 +; SI-NEXT: v_mov_b32_e32 v8, v53 +; SI-NEXT: v_mov_b32_e32 v10, v36 +; SI-NEXT: v_mov_b32_e32 v12, v54 +; SI-NEXT: v_mov_b32_e32 v16, v39 +; SI-NEXT: v_mov_b32_e32 v18, v34 +; SI-NEXT: v_mov_b32_e32 v21, v33 +; SI-NEXT: v_mov_b32_e32 v26, v37 +; SI-NEXT: v_mov_b32_e32 v29, v35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB109_4: -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: s_branch .LBB109_2 ; ; VI-LABEL: bitcast_v16bf16_to_v32i8_scalar: @@ -40291,26 +40342,26 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: s_cmp_lg_u32 s24, 0 ; VI-NEXT: s_cbranch_scc0 .LBB109_3 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s14, s23, 24 -; VI-NEXT: s_lshr_b32 s15, s23, 16 -; VI-NEXT: s_lshr_b32 s25, s23, 8 -; VI-NEXT: s_lshr_b32 s24, s22, 16 -; VI-NEXT: s_lshr_b32 s26, s22, 8 -; VI-NEXT: s_lshr_b32 s27, s21, 24 -; VI-NEXT: s_lshr_b32 s28, s21, 16 -; VI-NEXT: s_lshr_b32 s40, s21, 8 -; VI-NEXT: s_lshr_b32 s29, s20, 16 -; VI-NEXT: s_lshr_b32 s41, s20, 8 -; VI-NEXT: s_lshr_b32 s42, s19, 24 -; 
VI-NEXT: s_lshr_b32 s43, s19, 16 -; VI-NEXT: s_lshr_b32 s45, s19, 8 -; VI-NEXT: s_lshr_b32 s44, s18, 16 -; VI-NEXT: s_lshr_b32 s46, s18, 8 -; VI-NEXT: s_lshr_b32 s47, s17, 24 -; VI-NEXT: s_lshr_b32 s56, s17, 16 -; VI-NEXT: s_lshr_b32 s58, s17, 8 -; VI-NEXT: s_lshr_b32 s57, s16, 16 -; VI-NEXT: s_lshr_b32 s59, s16, 8 +; VI-NEXT: s_lshr_b32 s57, s23, 24 +; VI-NEXT: s_lshr_b32 s56, s23, 16 +; VI-NEXT: s_lshr_b32 s47, s23, 8 +; VI-NEXT: s_lshr_b32 s59, s22, 16 +; VI-NEXT: s_lshr_b32 s58, s22, 8 +; VI-NEXT: s_lshr_b32 s44, s21, 24 +; VI-NEXT: s_lshr_b32 s43, s21, 16 +; VI-NEXT: s_lshr_b32 s42, s21, 8 +; VI-NEXT: s_lshr_b32 s46, s20, 16 +; VI-NEXT: s_lshr_b32 s45, s20, 8 +; VI-NEXT: s_lshr_b32 s29, s19, 24 +; VI-NEXT: s_lshr_b32 s28, s19, 16 +; VI-NEXT: s_lshr_b32 s27, s19, 8 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s40, s18, 8 +; VI-NEXT: s_lshr_b32 s24, s17, 24 +; VI-NEXT: s_lshr_b32 s15, s17, 16 +; VI-NEXT: s_lshr_b32 s14, s17, 8 +; VI-NEXT: s_lshr_b32 s26, s16, 16 +; VI-NEXT: s_lshr_b32 s25, s16, 8 ; VI-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 ; VI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 ; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 @@ -40336,225 +40387,225 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] ; VI-NEXT: v_add_f32_e32 v0, s4, v2 -; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v2 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; VI-NEXT: v_add_f32_e32 v3, s4, v2 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v0, v3, v0, 16 ; VI-NEXT: v_add_f32_e32 v3, s4, v2 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v2 -; VI-NEXT: v_bfe_u32 v5, 
v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v9, v4, v3, 16 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; VI-NEXT: v_add_f32_e32 v3, s4, v2 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v2 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[5:6] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v8, v4, v3, 16 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_add_f32_e32 v3, s4, v2 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v2 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v2 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[5:6] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v17, v4, v3, 16 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_add_f32_e32 v3, s4, v2 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v2 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; 
VI-NEXT: v_add_f32_e32 v3, s4, v2 +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[5:6] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_alignbit_b32 v16, v4, v3, 16 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_add_f32_e32 v3, s4, v2 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v2 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v2 +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[5:6] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v2 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc ; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v25, v4, v3, 16 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; VI-NEXT: v_add_f32_e32 v3, s4, v2 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[5:6] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_add_f32_e32 v2, s4, v2 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v24, v2, v3, 16 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[24:25] -; VI-NEXT: v_lshrrev_b64 
v[19:20], 24, v[16:17] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v25 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v25 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[24:25], 16, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v25, v28 +; VI-NEXT: v_mov_b32_e32 v1, v4 +; VI-NEXT: v_mov_b32_e32 v9, v12 +; VI-NEXT: v_mov_b32_e32 v17, v20 +; VI-NEXT: v_lshrrev_b64 v[36:37], 24, v[24:25] +; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[16:17] +; VI-NEXT: v_lshrrev_b64 v[34:35], 24, v[8:9] +; VI-NEXT: v_lshrrev_b64 v[32:33], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v24 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v17 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v24 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v20 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v9 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v0 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; VI-NEXT: s_branch .LBB109_5 ; VI-NEXT: .LBB109_3: -; VI-NEXT: ; implicit-def: $sgpr59 -; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr25 +; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr4 -; VI-NEXT: ; implicit-def: $sgpr58 -; VI-NEXT: ; implicit-def: $sgpr56 -; VI-NEXT: ; implicit-def: $sgpr47 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr44 -; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr45 -; VI-NEXT: ; implicit-def: $sgpr43 -; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr24 +; VI-NEXT: ; implicit-def: $sgpr40 ; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr28 ; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; implicit-def: $sgpr8 -; VI-NEXT: ; implicit-def: $sgpr40 -; VI-NEXT: ; implicit-def: $sgpr28 -; VI-NEXT: ; implicit-def: $sgpr27 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr24 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr59 ; VI-NEXT: ; implicit-def: $sgpr10 -; VI-NEXT: 
; implicit-def: $sgpr25 -; VI-NEXT: ; implicit-def: $sgpr15 -; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr57 ; VI-NEXT: s_branch .LBB109_2 ; VI-NEXT: .LBB109_4: -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v8, s18 -; VI-NEXT: v_mov_b32_e32 v9, s19 -; VI-NEXT: v_mov_b32_e32 v16, s20 -; VI-NEXT: v_mov_b32_e32 v17, s21 +; VI-NEXT: v_mov_b32_e32 v26, s59 +; VI-NEXT: v_mov_b32_e32 v25, s58 +; VI-NEXT: v_mov_b32_e32 v31, s57 +; VI-NEXT: v_mov_b32_e32 v30, s56 +; VI-NEXT: v_mov_b32_e32 v29, s47 +; VI-NEXT: v_mov_b32_e32 v18, s46 +; VI-NEXT: v_mov_b32_e32 v17, s45 +; VI-NEXT: v_mov_b32_e32 v23, s44 +; VI-NEXT: v_mov_b32_e32 v22, s43 +; VI-NEXT: v_mov_b32_e32 v21, s42 +; VI-NEXT: v_mov_b32_e32 v10, s41 +; VI-NEXT: v_mov_b32_e32 v9, s40 +; VI-NEXT: v_mov_b32_e32 v15, s29 +; VI-NEXT: v_mov_b32_e32 v14, s28 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v2, s26 +; VI-NEXT: v_mov_b32_e32 v1, s25 +; VI-NEXT: v_mov_b32_e32 v7, s24 +; VI-NEXT: v_mov_b32_e32 v6, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 ; VI-NEXT: v_mov_b32_e32 v24, s22 -; VI-NEXT: v_mov_b32_e32 v25, s23 -; VI-NEXT: v_mov_b32_e32 v35, s59 -; VI-NEXT: v_mov_b32_e32 v2, s57 -; VI-NEXT: v_mov_b32_e32 v5, s58 -; VI-NEXT: v_mov_b32_e32 v6, s56 -; VI-NEXT: v_mov_b32_e32 v7, s47 -; VI-NEXT: v_mov_b32_e32 v34, s46 -; VI-NEXT: v_mov_b32_e32 v10, s44 -; VI-NEXT: v_mov_b32_e32 v13, s45 -; VI-NEXT: v_mov_b32_e32 v14, s43 -; VI-NEXT: v_mov_b32_e32 v15, s42 -; VI-NEXT: v_mov_b32_e32 v33, s41 -; VI-NEXT: v_mov_b32_e32 v18, s29 -; VI-NEXT: v_mov_b32_e32 v21, s40 -; VI-NEXT: v_mov_b32_e32 v22, s28 -; VI-NEXT: v_mov_b32_e32 v23, s27 -; VI-NEXT: v_mov_b32_e32 v32, s26 -; VI-NEXT: v_mov_b32_e32 v26, s24 -; VI-NEXT: v_mov_b32_e32 v29, s25 -; VI-NEXT: v_mov_b32_e32 v30, s15 -; VI-NEXT: v_mov_b32_e32 v31, s14 -; VI-NEXT: v_mov_b32_e32 v27, s10 -; VI-NEXT: v_mov_b32_e32 v19, s8 -; VI-NEXT: v_mov_b32_e32 v11, s6 -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v28, s23 +; VI-NEXT: v_mov_b32_e32 v16, s20 +; VI-NEXT: v_mov_b32_e32 v20, s21 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v12, s19 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v36, s10 +; VI-NEXT: v_mov_b32_e32 v37, s8 +; VI-NEXT: v_mov_b32_e32 v34, s6 +; VI-NEXT: v_mov_b32_e32 v32, s4 ; VI-NEXT: .LBB109_5: ; %end -; VI-NEXT: v_mov_b32_e32 v4, v1 -; VI-NEXT: v_mov_b32_e32 v12, v9 -; VI-NEXT: v_mov_b32_e32 v20, v17 -; VI-NEXT: v_mov_b32_e32 v28, v25 -; VI-NEXT: v_mov_b32_e32 v1, v35 -; VI-NEXT: v_mov_b32_e32 v9, v34 -; VI-NEXT: v_mov_b32_e32 v17, v33 -; VI-NEXT: v_mov_b32_e32 v25, v32 +; VI-NEXT: v_mov_b32_e32 v3, v32 +; VI-NEXT: v_mov_b32_e32 v11, v34 +; VI-NEXT: v_mov_b32_e32 v19, v37 +; VI-NEXT: v_mov_b32_e32 v27, v36 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v16bf16_to_v32i8_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll index 73b57a52201af..8055ea8be5261 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll @@ -1392,20 +1392,20 @@ define inreg i32 @bitcast_v2bf16_to_i32_scalar(<2 x bfloat> inreg %a, i32 inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 ; SI-NEXT: 
s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_lshr_b64 v[0:1], v[2:3], 16 ; SI-NEXT: s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: .LBB15_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB15_4: @@ -1421,24 +1421,24 @@ define inreg i32 @bitcast_v2bf16_to_i32_scalar(<2 x bfloat> inreg %a, i32 inreg ; VI-NEXT: s_cbranch_execnz .LBB15_4 ; VI-NEXT: .LBB15_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 ; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB15_3: ; VI-NEXT: s_branch .LBB15_2 @@ -3671,20 +3671,20 @@ define inreg float @bitcast_v2bf16_to_f32_scalar(<2 x bfloat> inreg %a, i32 inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_lshr_b64 v[0:1], v[2:3], 16 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: .LBB35_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB35_4: @@ -3700,24 +3700,24 @@ define inreg float @bitcast_v2bf16_to_f32_scalar(<2 x bfloat> inreg %a, i32 inre ; VI-NEXT: s_cbranch_execnz .LBB35_4 ; VI-NEXT: .LBB35_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: 
v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 ; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB35_3: ; VI-NEXT: s_branch .LBB35_2 @@ -5581,24 +5581,25 @@ define inreg <2 x i16> @bitcast_v2bf16_to_v2i16_scalar(<2 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s17 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[0:1], 16 ; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB51_4: -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_branch .LBB51_2 ; @@ -5611,24 +5612,24 @@ define inreg <2 x i16> @bitcast_v2bf16_to_v2i16_scalar(<2 x bfloat> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB51_4 ; VI-NEXT: .LBB51_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 ; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 ; VI-NEXT: 
v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB51_3: ; VI-NEXT: s_branch .LBB51_2 @@ -7278,24 +7279,24 @@ define inreg <2 x half> @bitcast_v2bf16_to_v2f16_scalar(<2 x bfloat> inreg %a, i ; VI-NEXT: s_cbranch_execnz .LBB63_4 ; VI-NEXT: .LBB63_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 ; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB63_3: ; VI-NEXT: s_branch .LBB63_2 @@ -8720,20 +8721,20 @@ define inreg <1 x i32> @bitcast_v2bf16_to_v1i32_scalar(<2 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 ; SI-NEXT: s_cbranch_scc0 .LBB73_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_lshr_b64 v[0:1], v[2:3], 16 ; SI-NEXT: s_cbranch_execnz .LBB73_3 ; SI-NEXT: .LBB73_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: .LBB73_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB73_4: @@ -8749,24 +8750,24 @@ define inreg <1 x i32> @bitcast_v2bf16_to_v1i32_scalar(<2 x bfloat> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB73_4 ; VI-NEXT: .LBB73_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: s_and_b32 s4, s16, 
0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 ; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB73_3: ; VI-NEXT: s_branch .LBB73_2 @@ -9336,30 +9337,31 @@ define inreg <4 x i8> @bitcast_v2bf16_to_v4i8_scalar(<2 x bfloat> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16 ; SI-NEXT: s_cbranch_scc0 .LBB77_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v2, v5, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshr_b64 v[4:5], v[1:2], 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 ; SI-NEXT: s_cbranch_execnz .LBB77_3 ; SI-NEXT: .LBB77_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshr_b64 v[4:5], v[1:2], 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 ; SI-NEXT: .LBB77_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v4 +; SI-NEXT: v_mov_b32_e32 v1, v5 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB77_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_branch .LBB77_2 ; @@ -9369,9 +9371,9 @@ define inreg <4 x i8> @bitcast_v2bf16_to_v4i8_scalar(<2 x bfloat> inreg %a, i32 ; VI-NEXT: s_cmp_lg_u32 s17, 0 ; VI-NEXT: s_cbranch_scc0 .LBB77_3 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s7, s16, 24 -; VI-NEXT: s_lshr_b32 s6, s16, 16 -; VI-NEXT: s_lshr_b32 s8, s16, 8 +; VI-NEXT: s_lshr_b32 s6, s16, 24 +; VI-NEXT: s_lshr_b32 s8, s16, 16 +; VI-NEXT: s_lshr_b32 s7, s16, 8 ; VI-NEXT: s_cbranch_execnz .LBB77_4 ; VI-NEXT: .LBB77_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 @@ -9392,21 +9394,21 @@ define inreg <4 x i8> @bitcast_v2bf16_to_v4i8_scalar(<2 x bfloat> inreg %a, i32 ; 
VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v4 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v4 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB77_3: +; VI-NEXT: ; implicit-def: $sgpr7 ; VI-NEXT: ; implicit-def: $sgpr8 ; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; implicit-def: $sgpr7 ; VI-NEXT: s_branch .LBB77_2 ; VI-NEXT: .LBB77_4: -; VI-NEXT: v_mov_b32_e32 v1, s8 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v3, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v2bf16_to_v4i8_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll index d5d2d4aafaa19..08038b90687c0 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll @@ -290,34 +290,34 @@ define inreg <3 x half> @bitcast_v3bf16_to_v3f16_scalar(<3 x bfloat> inreg %a, i ; VI-NEXT: s_cbranch_execnz .LBB1_4 ; VI-NEXT: .LBB1_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; VI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB1_3: ; VI-NEXT: s_branch .LBB1_2 @@ -964,16 +964,16 @@ define inreg <3 
x i16> @bitcast_v3bf16_to_v3i16_scalar(<3 x bfloat> inreg %a, i3 ; SI-NEXT: s_cbranch_execnz .LBB5_3 ; SI-NEXT: .LBB5_2: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshr_b64 v[3:4], v[1:2], 16 -; SI-NEXT: v_alignbit_b32 v0, v5, v0, 16 ; SI-NEXT: .LBB5_3: ; %end ; SI-NEXT: v_mov_b32_e32 v1, v3 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -992,34 +992,34 @@ define inreg <3 x i16> @bitcast_v3bf16_to_v3i16_scalar(<3 x bfloat> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB5_4 ; VI-NEXT: .LBB5_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; VI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB5_3: ; VI-NEXT: s_branch .LBB5_2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index 1a76b23df5bb5..0618b51d916e5 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ 
b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -6494,173 +6494,212 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; SI-LABEL: bitcast_v32bf16_to_v16i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s19 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v14 ; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18 -; SI-NEXT: 
v_mul_f32_e64 v52, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 -; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 -; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 -; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v59 +; SI-NEXT: v_lshr_b64 v[0:1], v[54:55], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[52:53], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v63 +; SI-NEXT: v_lshr_b64 v[2:3], v[50:51], 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v62 +; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v61 +; SI-NEXT: v_lshr_b64 v[4:5], v[38:39], 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v58 +; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v57 +; SI-NEXT: v_lshr_b64 v[6:7], v[34:35], 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v56 +; SI-NEXT: v_lshr_b64 v[7:8], v[32:33], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v47 +; SI-NEXT: v_lshr_b64 v[8:9], v[30:31], 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v46 +; SI-NEXT: v_lshr_b64 v[9:10], v[28:29], 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v45 +; SI-NEXT: v_lshr_b64 v[10:11], v[26:27], 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v44 +; SI-NEXT: v_lshr_b64 v[11:12], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v43 +; SI-NEXT: v_lshr_b64 v[12:13], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v42 +; SI-NEXT: v_lshr_b64 v[13:14], v[20:21], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v41 +; SI-NEXT: v_lshr_b64 v[14:15], v[18:19], 16 +; SI-NEXT: v_mov_b32_e32 v15, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshr_b64 v[39:40], v[16:17], 16 
+; SI-NEXT: v_mov_b32_e32 v17, v15 +; SI-NEXT: v_mov_b32_e32 v15, v39 ; SI-NEXT: s_cbranch_execnz .LBB23_3 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v47 +; SI-NEXT: 
v_and_b32_e32 v8, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_mov_b32_e32 v15, v16 ; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB23_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -6670,10 +6709,10 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v19, s30, 0 -; VI-NEXT: v_writelane_b32 v19, s31, 1 +; VI-NEXT: v_writelane_b32 v20, s30, 0 +; VI-NEXT: v_writelane_b32 v20, s31, 1 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; VI-NEXT: v_readfirstlane_b32 s30, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec @@ -6682,295 +6721,303 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB23_4 ; VI-NEXT: .LBB23_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s31, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_mov_b32_e32 v16, 0x40c00000 ; VI-NEXT: s_lshl_b32 s4, s30, 16 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 
v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v16 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[0:1] +; VI-NEXT: v_cndmask_b32_e64 v0, v3, v4, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: s_and_b32 s5, s26, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 -; 
VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s24, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 
v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s20, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; 
VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s18, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_lshrrev_b64 
v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s6, v16 +; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: s_lshl_b32 s6, s17, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v1, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc +; VI-NEXT: s_lshl_b32 s6, s19, 16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_cndmask_b32_e64 v17, v1, v5, s[4:5] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v9, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v3 +; VI-NEXT: s_lshl_b32 s6, s21, 16 +; VI-NEXT: v_mov_b32_e32 v1, v17 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_cndmask_b32_e64 v17, v5, v7, s[4:5] +; VI-NEXT: v_add_f32_e32 v5, s6, v16 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; VI-NEXT: v_add_f32_e32 v5, s6, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_bfe_u32 v11, v5, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: s_lshl_b32 s6, s23, 16 +; VI-NEXT: v_mov_b32_e32 v3, v17 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e64 v17, v7, v9, s[4:5] +; VI-NEXT: v_add_f32_e32 v7, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v7, v7 +; VI-NEXT: v_add_f32_e32 v7, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 +; VI-NEXT: s_lshl_b32 s6, s25, 16 
+; VI-NEXT: v_mov_b32_e32 v5, v17 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_cndmask_b32_e64 v17, v9, v11, s[4:5] +; VI-NEXT: v_add_f32_e32 v9, s6, v16 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v7, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v9, v9 +; VI-NEXT: v_add_f32_e32 v9, s6, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; VI-NEXT: v_bfe_u32 v15, v9, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v7, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v15, v17, vcc +; VI-NEXT: s_lshl_b32 s6, s27, 16 +; VI-NEXT: v_cndmask_b32_e64 v17, v11, v13, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v11, v11 +; VI-NEXT: v_add_f32_e32 v11, s6, v16 +; VI-NEXT: v_mov_b32_e32 v9, v17 +; VI-NEXT: v_bfe_u32 v17, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: s_and_b32 s7, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; VI-NEXT: v_cndmask_b32_e64 v17, v13, v15, s[4:5] +; VI-NEXT: v_add_f32_e32 v13, s7, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: s_lshl_b32 s6, s31, 16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v11, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc +; VI-NEXT: v_bfe_u32 v17, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v13 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; 
VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc +; VI-NEXT: v_add_f32_e32 v15, s4, v16 +; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[17:18] +; VI-NEXT: v_mov_b32_e32 v13, v15 +; VI-NEXT: v_mov_b32_e32 v15, v16 ; VI-NEXT: s_branch .LBB23_5 ; VI-NEXT: .LBB23_3: ; VI-NEXT: s_branch .LBB23_2 @@ -6992,10 +7039,10 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_mov_b32_e32 v14, s30 ; VI-NEXT: v_mov_b32_e32 v15, s31 ; VI-NEXT: .LBB23_5: ; %end -; VI-NEXT: v_readlane_b32 s30, v19, 0 -; VI-NEXT: v_readlane_b32 s31, v19, 1 +; VI-NEXT: v_readlane_b32 s30, v20, 0 +; VI-NEXT: v_readlane_b32 s31, v20, 1 ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -21379,173 +21426,212 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; SI-LABEL: bitcast_v32bf16_to_v16f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v21, 
1.0, v12 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s19 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v14 ; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 -; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 -; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 -; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v59 +; SI-NEXT: v_lshr_b64 v[0:1], v[54:55], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[52:53], 16 +; SI-NEXT: v_lshrrev_b32_e32 
v51, 16, v63 +; SI-NEXT: v_lshr_b64 v[2:3], v[50:51], 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v62 +; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v61 +; SI-NEXT: v_lshr_b64 v[4:5], v[38:39], 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v58 +; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v57 +; SI-NEXT: v_lshr_b64 v[6:7], v[34:35], 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v56 +; SI-NEXT: v_lshr_b64 v[7:8], v[32:33], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v47 +; SI-NEXT: v_lshr_b64 v[8:9], v[30:31], 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v46 +; SI-NEXT: v_lshr_b64 v[9:10], v[28:29], 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v45 +; SI-NEXT: v_lshr_b64 v[10:11], v[26:27], 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v44 +; SI-NEXT: v_lshr_b64 v[11:12], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v43 +; SI-NEXT: v_lshr_b64 v[12:13], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v42 +; SI-NEXT: v_lshr_b64 v[13:14], v[20:21], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v41 +; SI-NEXT: v_lshr_b64 v[14:15], v[18:19], 16 +; SI-NEXT: v_mov_b32_e32 v15, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshr_b64 v[39:40], v[16:17], 16 +; SI-NEXT: v_mov_b32_e32 v17, v15 +; SI-NEXT: v_mov_b32_e32 v15, v39 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: 
v_and_b32_e32 v6, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: 
v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_mov_b32_e32 v15, v16 ; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -21555,10 +21641,10 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v19, s30, 0 -; VI-NEXT: v_writelane_b32 v19, s31, 1 +; VI-NEXT: v_writelane_b32 v20, s30, 0 +; VI-NEXT: v_writelane_b32 v20, s31, 1 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; VI-NEXT: v_readfirstlane_b32 s30, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec @@ -21567,295 +21653,303 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB47_4 ; VI-NEXT: .LBB47_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s31, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: 
v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_mov_b32_e32 v16, 0x40c00000 ; VI-NEXT: s_lshl_b32 s4, s30, 16 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v16 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: 
v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[0:1] +; VI-NEXT: v_cndmask_b32_e64 v0, v3, v4, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: s_and_b32 s5, s26, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s24, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: 
v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 
vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s20, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s18, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, 
v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s6, v16 +; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: s_lshl_b32 s6, s17, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v1, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc +; VI-NEXT: s_lshl_b32 s6, s19, 16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_cndmask_b32_e64 v17, v1, v5, s[4:5] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v9, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v3 +; VI-NEXT: s_lshl_b32 s6, s21, 16 +; VI-NEXT: v_mov_b32_e32 v1, v17 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_cndmask_b32_e64 v17, v5, v7, s[4:5] +; VI-NEXT: v_add_f32_e32 v5, s6, v16 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 +; VI-NEXT: 
v_cndmask_b32_e32 v3, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; VI-NEXT: v_add_f32_e32 v5, s6, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_bfe_u32 v11, v5, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: s_lshl_b32 s6, s23, 16 +; VI-NEXT: v_mov_b32_e32 v3, v17 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e64 v17, v7, v9, s[4:5] +; VI-NEXT: v_add_f32_e32 v7, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v7, v7 +; VI-NEXT: v_add_f32_e32 v7, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 +; VI-NEXT: s_lshl_b32 s6, s25, 16 +; VI-NEXT: v_mov_b32_e32 v5, v17 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_cndmask_b32_e64 v17, v9, v11, s[4:5] +; VI-NEXT: v_add_f32_e32 v9, s6, v16 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v7, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v9, v9 +; VI-NEXT: v_add_f32_e32 v9, s6, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; VI-NEXT: v_bfe_u32 v15, v9, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v7, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v15, v17, vcc +; VI-NEXT: s_lshl_b32 s6, s27, 16 +; VI-NEXT: v_cndmask_b32_e64 v17, v11, v13, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v11, v11 +; VI-NEXT: v_add_f32_e32 v11, s6, v16 +; VI-NEXT: v_mov_b32_e32 v9, v17 +; VI-NEXT: v_bfe_u32 v17, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: s_and_b32 s7, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; VI-NEXT: v_cndmask_b32_e64 v17, v13, v15, s[4:5] +; VI-NEXT: v_add_f32_e32 v13, s7, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_bfe_u32 v15, v13, 
16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: s_lshl_b32 s6, s31, 16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v11, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc +; VI-NEXT: v_bfe_u32 v17, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v13 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc +; VI-NEXT: v_add_f32_e32 v15, s4, v16 +; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[17:18] +; VI-NEXT: v_mov_b32_e32 v13, v15 +; VI-NEXT: v_mov_b32_e32 v15, v16 ; VI-NEXT: s_branch .LBB47_5 ; VI-NEXT: .LBB47_3: ; VI-NEXT: s_branch .LBB47_2 @@ -21877,10 +21971,10 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI-NEXT: v_mov_b32_e32 v14, s30 ; VI-NEXT: v_mov_b32_e32 v15, s31 ; VI-NEXT: .LBB47_5: ; %end -; VI-NEXT: v_readlane_b32 s30, v19, 0 -; VI-NEXT: v_readlane_b32 s31, v19, 1 +; VI-NEXT: v_readlane_b32 s30, v20, 0 +; VI-NEXT: v_readlane_b32 s31, v20, 1 ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -35772,173 +35866,212 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; SI-LABEL: bitcast_v32bf16_to_v8i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword 
v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s19 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v14 ; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB67_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, 
v32 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 -; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 -; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 -; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v59 +; SI-NEXT: v_lshr_b64 v[0:1], v[54:55], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[52:53], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v63 +; SI-NEXT: v_lshr_b64 v[2:3], v[50:51], 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v62 +; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v61 +; SI-NEXT: v_lshr_b64 v[4:5], v[38:39], 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v58 +; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v57 +; SI-NEXT: v_lshr_b64 v[6:7], v[34:35], 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v56 +; SI-NEXT: v_lshr_b64 v[7:8], v[32:33], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v47 +; SI-NEXT: v_lshr_b64 v[8:9], v[30:31], 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v46 +; SI-NEXT: v_lshr_b64 v[9:10], v[28:29], 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v45 +; SI-NEXT: v_lshr_b64 v[10:11], v[26:27], 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v44 +; SI-NEXT: v_lshr_b64 v[11:12], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v43 +; SI-NEXT: v_lshr_b64 v[12:13], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v42 +; SI-NEXT: v_lshr_b64 v[13:14], v[20:21], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v41 +; SI-NEXT: v_lshr_b64 v[14:15], v[18:19], 16 +; SI-NEXT: v_mov_b32_e32 v15, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshr_b64 v[39:40], v[16:17], 16 +; SI-NEXT: v_mov_b32_e32 v17, v15 +; SI-NEXT: v_mov_b32_e32 v15, v39 ; SI-NEXT: s_cbranch_execnz .LBB67_3 ; SI-NEXT: .LBB67_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 
16 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; 
SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_mov_b32_e32 v15, v16 ; SI-NEXT: .LBB67_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB67_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -35948,10 +36081,10 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v19, s30, 0 -; VI-NEXT: v_writelane_b32 v19, s31, 1 +; VI-NEXT: v_writelane_b32 v20, s30, 0 +; VI-NEXT: v_writelane_b32 v20, s31, 1 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; VI-NEXT: v_readfirstlane_b32 s30, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec @@ -35960,295 +36093,303 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB67_4 ; VI-NEXT: .LBB67_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s31, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_mov_b32_e32 v16, 0x40c00000 ; VI-NEXT: s_lshl_b32 s4, s30, 16 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v16 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: 
v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[0:1] +; VI-NEXT: v_cndmask_b32_e64 v0, v3, v4, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: s_and_b32 s5, s26, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: 
v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s24, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: 
v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s20, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 
v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s18, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s6, v16 +; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: s_lshl_b32 s6, s17, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v1, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; 
VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc +; VI-NEXT: s_lshl_b32 s6, s19, 16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_cndmask_b32_e64 v17, v1, v5, s[4:5] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v9, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v3 +; VI-NEXT: s_lshl_b32 s6, s21, 16 +; VI-NEXT: v_mov_b32_e32 v1, v17 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_cndmask_b32_e64 v17, v5, v7, s[4:5] +; VI-NEXT: v_add_f32_e32 v5, s6, v16 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; VI-NEXT: v_add_f32_e32 v5, s6, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_bfe_u32 v11, v5, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: s_lshl_b32 s6, s23, 16 +; VI-NEXT: v_mov_b32_e32 v3, v17 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e64 v17, v7, v9, s[4:5] +; VI-NEXT: v_add_f32_e32 v7, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v7, v7 +; VI-NEXT: v_add_f32_e32 v7, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 +; VI-NEXT: s_lshl_b32 s6, s25, 16 +; VI-NEXT: v_mov_b32_e32 v5, v17 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_cndmask_b32_e64 v17, v9, v11, s[4:5] +; VI-NEXT: v_add_f32_e32 v9, s6, v16 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v7, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v9, v9 +; VI-NEXT: v_add_f32_e32 v9, s6, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; VI-NEXT: v_bfe_u32 v15, v9, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v7, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v15, v17, vcc +; VI-NEXT: s_lshl_b32 s6, s27, 
16 +; VI-NEXT: v_cndmask_b32_e64 v17, v11, v13, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v11, v11 +; VI-NEXT: v_add_f32_e32 v11, s6, v16 +; VI-NEXT: v_mov_b32_e32 v9, v17 +; VI-NEXT: v_bfe_u32 v17, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: s_and_b32 s7, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; VI-NEXT: v_cndmask_b32_e64 v17, v13, v15, s[4:5] +; VI-NEXT: v_add_f32_e32 v13, s7, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: s_lshl_b32 s6, s31, 16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v11, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc +; VI-NEXT: v_bfe_u32 v17, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v13 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc +; VI-NEXT: v_add_f32_e32 v15, s4, v16 +; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[17:18] +; VI-NEXT: v_mov_b32_e32 v13, v15 +; VI-NEXT: v_mov_b32_e32 v15, v16 ; VI-NEXT: s_branch .LBB67_5 ; VI-NEXT: .LBB67_3: ; VI-NEXT: s_branch .LBB67_2 @@ -36270,10 +36411,10 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_mov_b32_e32 v14, s30 ; VI-NEXT: v_mov_b32_e32 v15, s31 ; VI-NEXT: .LBB67_5: ; %end -; VI-NEXT: v_readlane_b32 s30, v19, 0 -; VI-NEXT: v_readlane_b32 s31, v19, 1 +; VI-NEXT: v_readlane_b32 s30, v20, 0 +; VI-NEXT: v_readlane_b32 s31, v20, 1 ; VI-NEXT: 
s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -49225,173 +49366,212 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; SI-LABEL: bitcast_v32bf16_to_v8f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s19 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 
v22, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v14 ; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB83_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 -; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 -; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 -; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v59 +; SI-NEXT: v_lshr_b64 v[0:1], v[54:55], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[52:53], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v63 +; SI-NEXT: v_lshr_b64 v[2:3], v[50:51], 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v62 +; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v61 +; SI-NEXT: v_lshr_b64 v[4:5], v[38:39], 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v58 +; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v57 +; SI-NEXT: v_lshr_b64 v[6:7], v[34:35], 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v56 +; SI-NEXT: v_lshr_b64 v[7:8], v[32:33], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v47 +; SI-NEXT: v_lshr_b64 v[8:9], v[30:31], 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v46 +; SI-NEXT: v_lshr_b64 v[9:10], v[28:29], 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v45 +; SI-NEXT: v_lshr_b64 v[10:11], v[26:27], 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v44 +; SI-NEXT: v_lshr_b64 v[11:12], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v43 +; SI-NEXT: v_lshr_b64 v[12:13], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v42 +; SI-NEXT: v_lshr_b64 v[13:14], 
v[20:21], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v41 +; SI-NEXT: v_lshr_b64 v[14:15], v[18:19], 16 +; SI-NEXT: v_mov_b32_e32 v15, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshr_b64 v[39:40], v[16:17], 16 +; SI-NEXT: v_mov_b32_e32 v17, v15 +; SI-NEXT: v_mov_b32_e32 v15, v39 ; SI-NEXT: s_cbranch_execnz .LBB83_3 ; SI-NEXT: .LBB83_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 +; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: 
v_and_b32_e32 v9, 0xffff0000, v30 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 +; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_mov_b32_e32 v15, v16 ; SI-NEXT: .LBB83_3: ; %end 
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB83_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -49401,10 +49581,10 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v19, s30, 0 -; VI-NEXT: v_writelane_b32 v19, s31, 1 +; VI-NEXT: v_writelane_b32 v20, s30, 0 +; VI-NEXT: v_writelane_b32 v20, s31, 1 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; VI-NEXT: v_readfirstlane_b32 s30, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec @@ -49413,295 +49593,303 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB83_4 ; VI-NEXT: .LBB83_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s31, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_mov_b32_e32 v16, 0x40c00000 ; VI-NEXT: s_lshl_b32 s4, s30, 16 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: 
v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v0 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v16 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[0:1] +; VI-NEXT: v_cndmask_b32_e64 v0, v3, v4, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: s_and_b32 s5, s26, 0xffff0000 +; VI-NEXT: 
v_lshrrev_b64 v[12:13], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s24, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s20, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: 
v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s18, 0xffff0000 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s5, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 -; VI-NEXT: v_add_f32_e32 v16, s4, v0 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s6, v16 +; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: s_lshl_b32 s6, s17, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v1, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc +; VI-NEXT: s_lshl_b32 s6, s19, 16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_cndmask_b32_e64 v17, v1, v5, s[4:5] +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v9, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v3 +; VI-NEXT: s_lshl_b32 s6, s21, 16 +; VI-NEXT: v_mov_b32_e32 v1, v17 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_cndmask_b32_e64 v17, v5, v7, s[4:5] +; VI-NEXT: v_add_f32_e32 v5, s6, v16 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v9, v11, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; VI-NEXT: v_add_f32_e32 v5, s6, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_bfe_u32 v11, v5, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: s_lshl_b32 s6, s23, 16 +; VI-NEXT: v_mov_b32_e32 v3, v17 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e64 v17, v7, v9, s[4:5] +; VI-NEXT: v_add_f32_e32 v7, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], 
v7, v7 +; VI-NEXT: v_add_f32_e32 v7, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 +; VI-NEXT: s_lshl_b32 s6, s25, 16 +; VI-NEXT: v_mov_b32_e32 v5, v17 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_cndmask_b32_e64 v17, v9, v11, s[4:5] +; VI-NEXT: v_add_f32_e32 v9, s6, v16 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v7, v13, v15, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v9, v9 +; VI-NEXT: v_add_f32_e32 v9, s6, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; VI-NEXT: v_bfe_u32 v15, v9, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v7, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v15, v17, vcc +; VI-NEXT: s_lshl_b32 s6, s27, 16 +; VI-NEXT: v_cndmask_b32_e64 v17, v11, v13, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; VI-NEXT: v_add_f32_e32 v11, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v11, v11 +; VI-NEXT: v_add_f32_e32 v11, s6, v16 +; VI-NEXT: v_mov_b32_e32 v9, v17 +; VI-NEXT: v_bfe_u32 v17, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: s_and_b32 s7, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; VI-NEXT: v_cndmask_b32_e64 v17, v13, v15, s[4:5] +; VI-NEXT: v_add_f32_e32 v13, s7, v16 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: s_lshl_b32 s6, s31, 16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v11, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc +; VI-NEXT: v_bfe_u32 v17, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v13 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 
+; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc +; VI-NEXT: v_add_f32_e32 v15, s4, v16 +; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[17:18] +; VI-NEXT: v_mov_b32_e32 v13, v15 +; VI-NEXT: v_mov_b32_e32 v15, v16 ; VI-NEXT: s_branch .LBB83_5 ; VI-NEXT: .LBB83_3: ; VI-NEXT: s_branch .LBB83_2 @@ -49723,10 +49911,10 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_mov_b32_e32 v14, s30 ; VI-NEXT: v_mov_b32_e32 v15, s31 ; VI-NEXT: .LBB83_5: ; %end -; VI-NEXT: v_readlane_b32 s30, v19, 0 -; VI-NEXT: v_readlane_b32 s31, v19, 1 +; VI-NEXT: v_readlane_b32 s30, v20, 0 +; VI-NEXT: v_readlane_b32 s31, v20, 1 ; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -61972,189 +62160,204 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mul_f32_e64 v57, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v56, 1.0, s17 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_mul_f32_e64 v32, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v63, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v62, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v61, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v60, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v59, 1.0, s28 -; SI-NEXT: v_mul_f32_e64 v58, 1.0, s29 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s17 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v21, 
1.0, v7 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v55, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s29 ; SI-NEXT: s_cbranch_scc0 .LBB95_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v57 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v56 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v63 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v62 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v61 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v60 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v59 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v47 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v45 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v43 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v58 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v46 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v42 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v40 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v31 ; SI-NEXT: s_cbranch_execnz .LBB95_3 ; SI-NEXT: .LBB95_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v56 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v28 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v62 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v53, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v53 +; SI-NEXT: v_lshr_b64 v[4:5], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v63 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v60 -; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_lshr_b64 v[8:9], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_alignbit_b32 v8, v7, v2, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v58 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; SI-NEXT: v_lshr_b64 v[12:13], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; SI-NEXT: v_alignbit_b32 v12, v10, v2, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v47 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_lshr_b64 v[16:17], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v21 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_alignbit_b32 v16, v11, v2, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v44 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v45 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_lshr_b64 v[20:21], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 -; SI-NEXT: v_alignbit_b32 v20, v14, v2, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v42 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v43 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v13 +; SI-NEXT: v_lshr_b64 v[24:25], v[2:3], 16 +; SI-NEXT: 
v_and_b32_e32 v3, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_alignbit_b32 v24, v15, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v41 -; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v40 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v14 +; SI-NEXT: v_lshr_b64 v[28:29], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v29 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 -; SI-NEXT: v_alignbit_b32 v26, v27, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v35 +; SI-NEXT: v_lshr_b64 v[38:39], v[38:39], 16 +; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_alignbit_b32 v22, v23, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v47 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v18, v19, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v30, v31, v15, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v10 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v14 -; SI-NEXT: v_alignbit_b32 
v14, v15, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshr_b64 v[18:19], v[38:39], 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_alignbit_b32 v10, v11, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v50, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_lshr_b64 v[6:7], v[50:51], 16 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v9 +; SI-NEXT: v_lshr_b64 v[10:11], v[48:49], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 +; SI-NEXT: v_lshr_b64 v[14:15], v[37:38], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_lshr_b64 v[41:42], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[35:36], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshr_b64 v[42:43], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[33:34], 16 +; SI-NEXT: v_add_f32_e32 v52, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v0 +; SI-NEXT: v_lshr_b64 v[43:44], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[31:32], 16 +; SI-NEXT: v_lshr_b64 v[2:3], v[52:53], 16 +; SI-NEXT: v_lshr_b64 v[44:45], v[21:22], 16 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 -; SI-NEXT: v_lshr_b64 v[33:34], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[35:36], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[37:38], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[50:51], v[25:26], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[29:30], 16 -; SI-NEXT: v_alignbit_b32 v28, v40, v41, 16 +; SI-NEXT: v_lshr_b64 v[45:46], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[54:55], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[46:47], v[29:30], 16 ; SI-NEXT: .LBB95_3: ; %end +; SI-NEXT: v_mov_b32_e32 v5, v40 +; SI-NEXT: v_mov_b32_e32 v9, v41 +; SI-NEXT: v_mov_b32_e32 v13, v42 +; SI-NEXT: v_mov_b32_e32 v17, v43 +; SI-NEXT: v_mov_b32_e32 v21, v44 +; SI-NEXT: v_mov_b32_e32 v25, v45 +; SI-NEXT: v_mov_b32_e32 v29, v46 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -62171,48 +62374,55 @@ define inreg <32 x i16> 
@bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v1, v33 -; SI-NEXT: v_mov_b32_e32 v5, v34 -; SI-NEXT: v_mov_b32_e32 v9, v35 -; SI-NEXT: v_mov_b32_e32 v13, v36 -; SI-NEXT: v_mov_b32_e32 v17, v37 -; SI-NEXT: v_mov_b32_e32 v21, v38 -; SI-NEXT: v_mov_b32_e32 v25, v50 -; SI-NEXT: v_mov_b32_e32 v29, v48 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, v54 +; SI-NEXT: v_mov_b32_e32 v3, v53 +; SI-NEXT: v_mov_b32_e32 v7, v51 +; SI-NEXT: v_mov_b32_e32 v11, v49 +; SI-NEXT: v_mov_b32_e32 v15, v38 +; SI-NEXT: v_mov_b32_e32 v19, v39 +; SI-NEXT: v_mov_b32_e32 v23, v36 +; SI-NEXT: v_mov_b32_e32 v27, v34 +; SI-NEXT: v_mov_b32_e32 v31, v32 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB95_4: -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_branch .LBB95_2 ; @@ -62232,295 +62442,302 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB95_4 ; VI-NEXT: .LBB95_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 -; 
VI-NEXT: v_add_f32_e32 v0, s4, v1 -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: s_lshl_b32 s5, s30, 16 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s5, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s5, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s5, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_lshl_b32 s5, s31, 16 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s5, v1 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: s_and_b32 s5, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_add_f32_e32 v5, s5, v1 -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_alignbit_b32 v14, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_alignbit_b32 v15, v5, v4, 16 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_mov_b32_e32 v16, 0x40c00000 +; VI-NEXT: s_lshl_b32 s4, s26, 16 +; VI-NEXT: v_add_f32_e32 v4, s4, v16 ; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v13, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v0, s4, v16 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; 
VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v12, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v16 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_lshl_b32 s4, s30, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v16 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s26, 16 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v11, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: s_and_b32 s6, s26, 0xffff0000 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[2:3] +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v0, v4, 16, 1 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v4 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v1, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v10, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: s_lshl_b32 s6, s24, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v0, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s24, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 ; VI-NEXT: 
v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v9, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: s_lshl_b32 s6, s22, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v0, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s22, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v8, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: s_lshl_b32 s6, s20, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v0, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s20, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v7, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: s_lshl_b32 s6, s18, 16 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v0, v2, 16, 1 +; VI-NEXT: s_and_b32 s6, s18, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_or_b32_e32 v1, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v16 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: s_and_b32 s7, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_add_f32_e32 v5, s7, v16 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v0, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v5 +; VI-NEXT: s_lshl_b32 s6, s16, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 +; VI-NEXT: v_or_b32_e32 v1, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_add_f32_e32 v3, s6, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; VI-NEXT: v_bfe_u32 v0, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7fff, v0 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v6, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_and_b32 s5, s17, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v3, s5, v16 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_bfe_u32 v1, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v5, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: s_and_b32 s5, s19, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v16, vcc +; VI-NEXT: v_add_f32_e32 v3, s5, v16 +; VI-NEXT: v_cndmask_b32_e32 v17, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: 
v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v4, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_bfe_u32 v16, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v3 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s4, v1 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v2, v16, v2, 16 -; VI-NEXT: v_add_f32_e32 v16, s4, v1 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_f32_e32 v3, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: s_and_b32 s5, s21, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; VI-NEXT: v_add_f32_e32 v5, s5, v16 +; VI-NEXT: v_mov_b32_e32 v1, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v7, v9, vcc +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_add_f32_e32 v5, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; VI-NEXT: v_bfe_u32 v9, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: s_and_b32 s5, s23, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; VI-NEXT: v_add_f32_e32 v7, s5, v16 +; VI-NEXT: v_mov_b32_e32 v3, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v9, v11, vcc +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_add_f32_e32 v7, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; VI-NEXT: v_bfe_u32 v11, v7, 16, 1 +; VI-NEXT: 
v_add_u32_e32 v11, vcc, v11, v7 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: s_and_b32 s5, s25, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; VI-NEXT: v_add_f32_e32 v9, s5, v16 +; VI-NEXT: v_mov_b32_e32 v5, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v11, v13, vcc +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: s_lshl_b32 s4, s25, 16 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_add_f32_e32 v9, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc +; VI-NEXT: v_bfe_u32 v13, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v9 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: s_and_b32 s5, s27, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_add_f32_e32 v11, s5, v16 +; VI-NEXT: v_mov_b32_e32 v7, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v13, v15, vcc +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: s_lshl_b32 s4, s27, 16 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_add_f32_e32 v11, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v13, v13, v15, vcc +; VI-NEXT: v_bfe_u32 v15, v11, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v11 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: s_and_b32 s5, s29, 0xffff0000 +; VI-NEXT: v_mov_b32_e32 v9, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; VI-NEXT: v_add_f32_e32 v13, s5, v16 +; VI-NEXT: v_cndmask_b32_e32 v17, v15, v17, vcc +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_mov_b32_e32 v11, v17 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc +; VI-NEXT: v_bfe_u32 v17, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v13 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_add_f32_e32 v17, s4, v1 -; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_add_f32_e32 v1, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; VI-NEXT: v_bfe_u32 v18, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16 -; VI-NEXT: v_alignbit_b32 
v0, v16, v0, 16 +; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v13, s4, v16 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_lshl_b32 s4, s31, 16 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc +; VI-NEXT: v_add_f32_e32 v15, s4, v16 +; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_lshrrev_b64 v[17:18], 16, v[17:18] +; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16] +; VI-NEXT: v_mov_b32_e32 v13, v17 ; VI-NEXT: s_branch .LBB95_5 ; VI-NEXT: .LBB95_3: ; VI-NEXT: s_branch .LBB95_2 @@ -74867,295 +75084,302 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB103_4 ; VI-NEXT: .LBB103_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v0, s4, v1 -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: s_lshl_b32 s5, s30, 16 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s5, v1 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s5, s30, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s5, v1 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_lshl_b32 s5, s31, 16 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s5, v1 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: s_and_b32 s5, s31, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_add_f32_e32 v5, s5, v1 -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_alignbit_b32 v14, v3, v2, 16 -; VI-NEXT: v_add_f32_e32 v2, s4, v1 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_alignbit_b32 v15, v5, v4, 16 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, 
vcc
-; VI-NEXT:    v_add_f32_e32 v3, s4, v1
-; VI-NEXT:    v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT:    v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT:    v_mov_b32_e32 v16, 0x40c00000
+; VI-NEXT:    s_lshl_b32 s4, s26, 16
+; VI-NEXT:    v_add_f32_e32 v4, s4, v16
 ; VI-NEXT:    s_lshl_b32 s4, s28, 16
-; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT:    v_alignbit_b32 v13, v3, v2, 16
-; VI-NEXT:    v_add_f32_e32 v2, s4, v1
-; VI-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT:    v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT:    v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT:    v_add_f32_e32 v0, s4, v16
+; VI-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; VI-NEXT:    s_and_b32 s4, s28, 0xffff0000
-; VI-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT:    v_add_f32_e32 v3, s4, v1
-; VI-NEXT:    v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT:    v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT:    s_lshl_b32 s4, s27, 16
-; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT:    v_alignbit_b32 v12, v3, v2, 16
-; VI-NEXT:    v_add_f32_e32 v2, s4, v1
+; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT:    v_add_f32_e32 v1, s4, v16
+; VI-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x7fff, v2
+; VI-NEXT:    v_or_b32_e32 v3, 0x400000, v1
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT:    s_lshl_b32 s4, s30, 16
+; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; VI-NEXT:    v_add_f32_e32 v2, s4, v16
 ; VI-NEXT:    v_bfe_u32 v3, v2, 16, 1
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, v3, v2
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT:    v_or_b32_e32 v5, 0x400000, v2
 ; VI-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT:    s_and_b32 s4, s27, 0xffff0000
-; VI-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT:    v_add_f32_e32 v3, s4, v1
-; VI-NEXT:    v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT:    v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT:    s_and_b32 s4, s30, 0xffff0000
+; VI-NEXT:    v_cndmask_b32_e32 v2, v3, v5, vcc
+; VI-NEXT:    v_add_f32_e32 v3, s4, v16
+; VI-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT:    v_or_b32_e32 v6, 0x400000, v3
 ; VI-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT:    s_lshl_b32 s4, s26, 16
+; VI-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
 ; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT:    v_alignbit_b32 v11, v3, v2, 16
-; VI-NEXT:    v_add_f32_e32 v2, s4, v1
+; VI-NEXT:    s_and_b32 s6, s26, 0xffff0000
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT:    v_lshrrev_b64 v[14:15], 16, v[2:3]
+; VI-NEXT:    v_add_f32_e32 v2, s6, v16
+; VI-NEXT:    v_lshrrev_b64 v[12:13], 16, v[0:1]
+; VI-NEXT:    v_bfe_u32 v0, v4, 16, 1
 ; VI-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v4
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x7fff, v0
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT:    v_or_b32_e32 v1, 0x400000, v4
+; VI-NEXT:    v_cmp_u_f32_e64 s[4:5], v4, v4
 ; VI-NEXT:    v_or_b32_e32 v4, 0x400000, v2
 ; VI-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT:    s_and_b32 s4, s26, 0xffff0000
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT:    v_add_f32_e32 v3, s4, v1
-; VI-NEXT:    v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT:    v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT:    s_lshl_b32 s4, s25, 16
-; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT:    v_alignbit_b32 v10, v3, v2, 16
-; VI-NEXT:    v_add_f32_e32 v2, s4, v1
+; VI-NEXT:    s_lshl_b32 s6, s24, 16
+; VI-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; VI-NEXT:    v_add_f32_e32 v2, s6, v16
+; VI-NEXT:    v_lshrrev_b64 v[10:11], 16, v[0:1]
+; VI-NEXT:    v_bfe_u32 v0, v2, 16, 1
+; VI-NEXT:    s_and_b32 s6, s24, 0xffff0000
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_or_b32_e32 v1, 0x400000, v2
+; VI-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
+; VI-NEXT:    v_add_f32_e32 v2, s6, v16
 ; VI-NEXT:    v_bfe_u32 v3, v2, 16, 1
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x7fff, v0
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
 ; VI-NEXT:    v_or_b32_e32 v4, 0x400000, v2
 ; VI-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT:    s_and_b32 s4, s25, 0xffff0000
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT:    v_add_f32_e32 v3, s4, v1
-; VI-NEXT:    v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT:    v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT:    s_lshl_b32 s4, s24, 16
-; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT:    v_alignbit_b32 v9, v3, v2, 16
-; VI-NEXT:    v_add_f32_e32 v2, s4, v1
+; VI-NEXT:    s_lshl_b32 s6, s22, 16
+; VI-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; VI-NEXT:    v_add_f32_e32 v2, s6, v16
+; VI-NEXT:    v_lshrrev_b64 v[8:9], 16, v[0:1]
+; VI-NEXT:    v_bfe_u32 v0, v2, 16, 1
+; VI-NEXT:    s_and_b32 s6, s22, 0xffff0000
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_or_b32_e32 v1, 0x400000, v2
+; VI-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
+; VI-NEXT:    v_add_f32_e32 v2, s6, v16
 ; VI-NEXT:    v_bfe_u32 v3, v2, 16, 1
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x7fff, v0
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
 ; VI-NEXT:    v_or_b32_e32 v4, 0x400000, v2
 ; VI-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT:    s_and_b32 s4, s24, 0xffff0000
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT:    v_add_f32_e32 v3, s4, v1
-; VI-NEXT:    v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT:    v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT:    s_lshl_b32 s4, s23, 16
-; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT:    v_alignbit_b32 v8, v3, v2, 16
-; VI-NEXT:    v_add_f32_e32 v2, s4, v1
+; VI-NEXT:    s_lshl_b32 s6, s20, 16
+; VI-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; VI-NEXT:    v_add_f32_e32 v2, s6, v16
+; VI-NEXT:    v_lshrrev_b64 v[6:7], 16, v[0:1]
+; VI-NEXT:    v_bfe_u32 v0, v2, 16, 1
+; VI-NEXT:    s_and_b32 s6, s20, 0xffff0000
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_or_b32_e32 v1, 0x400000, v2
+; VI-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
+; VI-NEXT:    v_add_f32_e32 v2, s6, v16
 ; VI-NEXT:    v_bfe_u32 v3, v2, 16, 1
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x7fff, v0
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
 ; VI-NEXT:    v_or_b32_e32 v4, 0x400000, v2
 ; VI-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT:    s_and_b32 s4, s23, 0xffff0000
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT:    v_add_f32_e32 v3, s4, v1
-; VI-NEXT:    v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT:    v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT:    s_lshl_b32 s4, s22, 16
-; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT:    v_alignbit_b32 v7, v3, v2, 16
-; VI-NEXT:    v_add_f32_e32 v2, s4, v1
+; VI-NEXT:    s_lshl_b32 s6, s18, 16
+; VI-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; VI-NEXT:    v_add_f32_e32 v2, s6, v16
+; VI-NEXT:    v_lshrrev_b64 v[4:5], 16, v[0:1]
+; VI-NEXT:    v_bfe_u32 v0, v2, 16, 1
+; VI-NEXT:    s_and_b32 s6, s18, 0xffff0000
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT:    v_or_b32_e32 v1, 0x400000, v2
+; VI-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
+; VI-NEXT:    v_add_f32_e32 v2, s6, v16
 ; VI-NEXT:    v_bfe_u32 v3, v2, 16, 1
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x7fff, v0
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT:    v_or_b32_e32 v5, 0x400000, v2
 ; VI-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT:    s_and_b32 s4, s22, 0xffff0000
-; VI-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT:    v_add_f32_e32 v3, s4, v1
-; VI-NEXT:    v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT:    v_cndmask_b32_e32 v2, v3, v5, vcc
+; VI-NEXT:    s_and_b32 s7, s16, 0xffff0000
+; VI-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; VI-NEXT:    v_add_f32_e32 v5, s7, v16
+; VI-NEXT:    v_lshrrev_b64 v[2:3], 16, v[0:1]
+; VI-NEXT:    v_bfe_u32 v0, v5, 16, 1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v5
+; VI-NEXT:    s_lshl_b32 s6, s16, 16
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x7fff, v0
+; VI-NEXT:    v_or_b32_e32 v1, 0x400000, v5
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT:    v_add_f32_e32 v3, s6, v16
+; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
+; VI-NEXT:    v_bfe_u32 v0, v3, 16, 1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x7fff, v0
 ; VI-NEXT:    v_or_b32_e32 v5, 0x400000, v3
 ; VI-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT:    s_lshl_b32 s4, s21, 16
-; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT:    v_alignbit_b32 v6, v3, v2, 16
-; VI-NEXT:    v_add_f32_e32 v2, s4, v1
-; VI-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT:    v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT:    v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT:    s_and_b32 s4, s21, 0xffff0000
-; VI-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT:    v_add_f32_e32 v3, s4, v1
-; VI-NEXT:    v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT:    s_and_b32 s5, s17, 0xffff0000
+; VI-NEXT:    v_add_f32_e32 v3, s5, v16
+; VI-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT:    v_bfe_u32 v1, v3, 16, 1
+; VI-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
+; VI-NEXT:    s_lshl_b32 s4, s17, 16
+; VI-NEXT:    v_add_u32_e32 v1, vcc, 0x7fff, v1
 ; VI-NEXT:    v_or_b32_e32 v5, 0x400000, v3
 ; VI-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT:    s_lshl_b32 s4, s20, 16
-; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT:    v_alignbit_b32 v5, v3, v2, 16
-; VI-NEXT:    v_add_f32_e32 v2, s4, v1
-; VI-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT:    v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT:    v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT:    s_and_b32 s4, s20, 0xffff0000
-; VI-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT:    v_add_f32_e32 v3, s4, v1
-; VI-NEXT:    v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT:    v_or_b32_e32 v16, 0x400000, v3
+; VI-NEXT:    v_add_f32_e32 v3, s4, v16
+; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; VI-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT:    s_and_b32 s5, s19, 0xffff0000
+; VI-NEXT:    v_or_b32_e32 v7, 0x400000, v3
 ; VI-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT:    v_cndmask_b32_e32 v3, v4, v16, vcc
+; VI-NEXT:    v_add_f32_e32 v3, s5, v16
+; VI-NEXT:    v_cndmask_b32_e32 v17, v5, v7, vcc
+; VI-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
 ; VI-NEXT:    s_lshl_b32 s4, s19, 16
-; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT:    v_alignbit_b32 v4, v3, v2, 16
-; VI-NEXT:    v_add_f32_e32 v2, s4, v1
-; VI-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT:    v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT:    v_or_b32_e32 v16, 0x400000, v2
-; VI-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT:    s_and_b32 s4, s19, 0xffff0000
-; VI-NEXT:    v_cndmask_b32_e32 v2, v3, v16, vcc
-; VI-NEXT:    v_add_f32_e32 v3, s4, v1
-; VI-NEXT:    v_bfe_u32 v16, v3, 16, 1
-; VI-NEXT:    v_add_u32_e32 v16, vcc, v16, v3
-; VI-NEXT:    v_add_u32_e32 v16, vcc, 0x7fff, v16
-; VI-NEXT:    v_or_b32_e32 v17, 0x400000, v3
+; VI-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT:    v_or_b32_e32 v7, 0x400000, v3
 ; VI-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT:    v_cndmask_b32_e32 v3, v16, v17, vcc
-; VI-NEXT:    s_lshl_b32 s4, s18, 16
-; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; VI-NEXT:    v_alignbit_b32 v3, v3, v2, 16
-; VI-NEXT:    v_add_f32_e32 v2, s4, v1
-; VI-NEXT:    v_bfe_u32 v16, v2, 16, 1
-; VI-NEXT:    v_add_u32_e32 v16, vcc, v16, v2
-; VI-NEXT:    v_add_u32_e32 v16, vcc, 0x7fff, v16
-; VI-NEXT:    v_or_b32_e32 v17, 0x400000, v2
-; VI-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT:    s_and_b32 s4, s18, 0xffff0000
-; VI-NEXT:    v_cndmask_b32_e32 v2, v16, v17, vcc
-; VI-NEXT:    v_add_f32_e32 v16, s4, v1
-; VI-NEXT:    v_bfe_u32 v17, v16, 16, 1
-; VI-NEXT:    v_add_u32_e32 v17, vcc, v17, v16
-; VI-NEXT:    v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT:    v_or_b32_e32 v18, 0x400000, v16
-; VI-NEXT:    v_cmp_u_f32_e32 vcc, v16, v16
-; VI-NEXT:    v_cndmask_b32_e32 v16, v17, v18, vcc
-; VI-NEXT:    s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT:    v_alignbit_b32 v2, v16, v2, 16
-; VI-NEXT:    v_add_f32_e32 v16, s4, v1
-; VI-NEXT:    v_bfe_u32 v17, v16, 16, 1
-; VI-NEXT:    v_add_u32_e32 v17, vcc, v17, v16
+; VI-NEXT:    v_add_f32_e32 v3, s4, v16
+; VI-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; VI-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; VI-NEXT:    v_lshrrev_b32_e32 v18, 16, v1
+; VI-NEXT:    v_add_u32_e32 v7, vcc, v7, v3
+; VI-NEXT:    v_lshrrev_b64 v[17:18], 16, v[17:18]
+; VI-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT:    s_and_b32 s5, s21, 0xffff0000
+; VI-NEXT:    v_or_b32_e32 v9, 0x400000, v3
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT:    v_lshrrev_b32_e32 v18, 16, v5
+; VI-NEXT:    v_add_f32_e32 v5, s5, v16
+; VI-NEXT:    v_mov_b32_e32 v1, v17
+; VI-NEXT:    v_cndmask_b32_e32 v17, v7, v9, vcc
+; VI-NEXT:    v_bfe_u32 v7, v5, 16, 1
+; VI-NEXT:    v_add_u32_e32 v7, vcc, v7, v5
+; VI-NEXT:    s_lshl_b32 s4, s21, 16
+; VI-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
+; VI-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT:    v_add_f32_e32 v5, s4, v16
+; VI-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
+; VI-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; VI-NEXT:    v_add_u32_e32 v9, vcc, v9, v5
+; VI-NEXT:    v_lshrrev_b64 v[17:18], 16, v[17:18]
+; VI-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT:    s_and_b32 s5, s23, 0xffff0000
+; VI-NEXT:    v_or_b32_e32 v11, 0x400000, v5
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT:    v_lshrrev_b32_e32 v18, 16, v7
+; VI-NEXT:    v_add_f32_e32 v7, s5, v16
+; VI-NEXT:    v_mov_b32_e32 v3, v17
+; VI-NEXT:    v_cndmask_b32_e32 v17, v9, v11, vcc
+; VI-NEXT:    v_bfe_u32 v9, v7, 16, 1
+; VI-NEXT:    v_add_u32_e32 v9, vcc, v9, v7
+; VI-NEXT:    s_lshl_b32 s4, s23, 16
+; VI-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT:    v_or_b32_e32 v11, 0x400000, v7
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT:    v_add_f32_e32 v7, s4, v16
+; VI-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
+; VI-NEXT:    v_bfe_u32 v11, v7, 16, 1
+; VI-NEXT:    v_add_u32_e32 v11, vcc, v11, v7
+; VI-NEXT:    v_lshrrev_b64 v[17:18], 16, v[17:18]
+; VI-NEXT:    v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT:    s_and_b32 s5, s25, 0xffff0000
+; VI-NEXT:    v_or_b32_e32 v13, 0x400000, v7
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT:    v_lshrrev_b32_e32 v18, 16, v9
+; VI-NEXT:    v_add_f32_e32 v9, s5, v16
+; VI-NEXT:    v_mov_b32_e32 v5, v17
+; VI-NEXT:    v_cndmask_b32_e32 v17, v11, v13, vcc
+; VI-NEXT:    v_bfe_u32 v11, v9, 16, 1
+; VI-NEXT:    v_add_u32_e32 v11, vcc, v11, v9
+; VI-NEXT:    s_lshl_b32 s4, s25, 16
+; VI-NEXT:    v_add_u32_e32 v11, vcc, 0x7fff, v11
+; VI-NEXT:    v_or_b32_e32 v13, 0x400000, v9
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT:    v_add_f32_e32 v9, s4, v16
+; VI-NEXT:    v_cndmask_b32_e32 v11, v11, v13, vcc
+; VI-NEXT:    v_bfe_u32 v13, v9, 16, 1
+; VI-NEXT:    v_add_u32_e32 v13, vcc, v13, v9
+; VI-NEXT:    v_lshrrev_b64 v[17:18], 16, v[17:18]
+; VI-NEXT:    v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT:    s_and_b32 s5, s27, 0xffff0000
+; VI-NEXT:    v_or_b32_e32 v15, 0x400000, v9
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT:    v_lshrrev_b32_e32 v18, 16, v11
+; VI-NEXT:    v_add_f32_e32 v11, s5, v16
+; VI-NEXT:    v_mov_b32_e32 v7, v17
+; VI-NEXT:    v_cndmask_b32_e32 v17, v13, v15, vcc
+; VI-NEXT:    v_bfe_u32 v13, v11, 16, 1
+; VI-NEXT:    v_add_u32_e32 v13, vcc, v13, v11
+; VI-NEXT:    s_lshl_b32 s4, s27, 16
+; VI-NEXT:    v_add_u32_e32 v13, vcc, 0x7fff, v13
+; VI-NEXT:    v_or_b32_e32 v15, 0x400000, v11
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT:    v_add_f32_e32 v11, s4, v16
+; VI-NEXT:    v_cndmask_b32_e32 v13, v13, v15, vcc
+; VI-NEXT:    v_bfe_u32 v15, v11, 16, 1
+; VI-NEXT:    v_lshrrev_b64 v[17:18], 16, v[17:18]
+; VI-NEXT:    v_add_u32_e32 v15, vcc, v15, v11
+; VI-NEXT:    v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT:    s_and_b32 s5, s29, 0xffff0000
+; VI-NEXT:    v_mov_b32_e32 v9, v17
+; VI-NEXT:    v_or_b32_e32 v17, 0x400000, v11
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT:    v_lshrrev_b32_e32 v18, 16, v13
+; VI-NEXT:    v_add_f32_e32 v13, s5, v16
+; VI-NEXT:    v_cndmask_b32_e32 v17, v15, v17, vcc
+; VI-NEXT:    v_bfe_u32 v15, v13, 16, 1
+; VI-NEXT:    v_lshrrev_b64 v[17:18], 16, v[17:18]
+; VI-NEXT:    v_add_u32_e32 v15, vcc, v15, v13
+; VI-NEXT:    s_lshl_b32 s4, s29, 16
+; VI-NEXT:    v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT:    v_mov_b32_e32 v11, v17
+; VI-NEXT:    v_or_b32_e32 v17, 0x400000, v13
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT:    v_add_f32_e32 v13, s4, v16
+; VI-NEXT:    v_cndmask_b32_e32 v15, v15, v17, vcc
+; VI-NEXT:    v_bfe_u32 v17, v13, 16, 1
+; VI-NEXT:    v_add_u32_e32 v17, vcc, v17, v13
 ; VI-NEXT:    v_add_u32_e32 v17, vcc, 0x7fff, v17
-; VI-NEXT:    v_or_b32_e32 v18, 0x400000, v16
-; VI-NEXT:    v_cmp_u_f32_e32 vcc, v16, v16
-; VI-NEXT:    s_lshl_b32 s4, s17, 16
-; VI-NEXT:    v_cndmask_b32_e32 v16, v17, v18, vcc
-; VI-NEXT:    v_add_f32_e32 v17, s4, v1
-; VI-NEXT:    v_bfe_u32 v18, v17, 16, 1
-; VI-NEXT:    v_add_u32_e32 v18, vcc, v18, v17
-; VI-NEXT:    v_add_u32_e32 v18, vcc, 0x7fff, v18
-; VI-NEXT:    s_and_b32 s4, s17, 0xffff0000
-; VI-NEXT:    v_or_b32_e32 v19, 0x400000, v17
-; VI-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
-; VI-NEXT:    v_add_f32_e32 v1, s4, v1
-; VI-NEXT:    v_cndmask_b32_e32 v17, v18, v19, vcc
-; VI-NEXT:    v_bfe_u32 v18, v1, 16, 1
-; VI-NEXT:    v_add_u32_e32 v18, vcc, v18, v1
-; VI-NEXT:    v_add_u32_e32 v18, vcc, 0x7fff, v18
-; VI-NEXT:    v_or_b32_e32 v19, 0x400000, v1
-; VI-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, v18, v19, vcc
-; VI-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
-; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT:    v_alignbit_b32 v1, v1, v17, 16
-; VI-NEXT:    v_alignbit_b32 v0, v16, v0, 16
+; VI-NEXT:    s_and_b32 s4, s31, 0xffff0000
+; VI-NEXT:    v_or_b32_e32 v18, 0x400000, v13
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT:    v_add_f32_e32 v13, s4, v16
+; VI-NEXT:    v_cndmask_b32_e32 v17, v17, v18, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v18, 16, v15
+; VI-NEXT:    v_bfe_u32 v15, v13, 16, 1
+; VI-NEXT:    v_add_u32_e32 v15, vcc, v15, v13
+; VI-NEXT:    v_add_u32_e32 v15, vcc, 0x7fff, v15
+; VI-NEXT:    v_or_b32_e32 v19, 0x400000, v13
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT:    s_lshl_b32 s4, s31, 16
+; VI-NEXT:    v_cndmask_b32_e32 v13, v15, v19, vcc
+; VI-NEXT:    v_add_f32_e32 v15, s4, v16
+; VI-NEXT:    v_bfe_u32 v16, v15, 16, 1
+; VI-NEXT:    v_add_u32_e32 v16, vcc, v16, v15
+; VI-NEXT:    v_add_u32_e32 v16, vcc, 0x7fff, v16
+; VI-NEXT:    v_or_b32_e32 v19, 0x400000, v15
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; VI-NEXT:    v_cndmask_b32_e32 v15, v16, v19, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v16, 16, v13
+; VI-NEXT:    v_lshrrev_b64 v[17:18], 16, v[17:18]
+; VI-NEXT:    v_lshrrev_b64 v[15:16], 16, v[15:16]
+; VI-NEXT:    v_mov_b32_e32 v13, v17
 ; VI-NEXT:    s_branch .LBB103_5
 ; VI-NEXT:  .LBB103_3:
 ; VI-NEXT:    s_branch .LBB103_2
@@ -87523,1321 +87747,1481 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; SI-LABEL: bitcast_v32bf16_to_v64i8_scalar:
 ; SI: ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_writelane_b32 v40, s34, 0
+; SI-NEXT:    v_writelane_b32 v40, s35, 1
+; SI-NEXT:    v_writelane_b32 v40, s36, 2
+; SI-NEXT:    v_writelane_b32 v40, s37, 3
+; SI-NEXT:    v_writelane_b32 v40, s38, 4
+; SI-NEXT:    v_writelane_b32 v40, s39, 5
+; SI-NEXT:    v_writelane_b32 v40, s48, 6
+; SI-NEXT:    v_writelane_b32 v40, s49, 7
+; SI-NEXT:    v_writelane_b32 v40, s50, 8
+; SI-NEXT:    v_writelane_b32 v40, s51, 9
+; SI-NEXT:    v_writelane_b32 v40, s52, 10
+; SI-NEXT:    v_writelane_b32 v40, s53, 11
+; SI-NEXT:    v_writelane_b32 v40, s54, 12
+; SI-NEXT:    v_writelane_b32 v40, s55, 13
+; SI-NEXT:    v_writelane_b32 v40, s64, 14
+; SI-NEXT:    v_writelane_b32 v40, s65, 15
+; SI-NEXT:    v_writelane_b32 v40, s66, 16
+; SI-NEXT:    v_writelane_b32 v40, s67, 17
+; SI-NEXT:    v_writelane_b32 v40, s68, 18
+; SI-NEXT:    v_writelane_b32 v40, s69, 19
+; SI-NEXT:    v_writelane_b32 v40, s70, 20
+; SI-NEXT:    v_writelane_b32 v40, s71, 21
+; SI-NEXT:    v_writelane_b32 v40, s80, 22
+; SI-NEXT:    v_writelane_b32 v40, s81, 23
+; SI-NEXT:    v_writelane_b32 v40, s82, 24
+; SI-NEXT:    v_writelane_b32 v40, s83, 25
+; SI-NEXT:    v_writelane_b32 v40, s84, 26
+; SI-NEXT:    v_writelane_b32 v40, s85, 27
+; SI-NEXT:    v_writelane_b32 v40, s86, 28
+; SI-NEXT:    v_writelane_b32 v40, s87, 29
+; SI-NEXT:    v_writelane_b32 v40, s96, 30
+; SI-NEXT:    v_writelane_b32 v40, s97, 31
+; SI-NEXT:    v_writelane_b32 v40, s98, 32
+; SI-NEXT:    v_writelane_b32 v40, s99, 33
+; SI-NEXT:    v_writelane_b32 v40, s30, 34
+; SI-NEXT:    v_writelane_b32 v40, s31, 35
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v19
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    v_mul_f32_e64 v19, 1.0, s17
-; SI-NEXT:    v_mul_f32_e32 v33, 1.0, v2
-; SI-NEXT:    v_mul_f32_e32 v27, 1.0, v1
-; SI-NEXT:    v_mul_f32_e32 v50, 1.0, v4
-; SI-NEXT:    v_mul_f32_e32 v52, 1.0, v3
-; SI-NEXT:    v_mul_f32_e32 v39, 1.0, v6
-; SI-NEXT:    v_mul_f32_e32 v49, 1.0, v5
-; SI-NEXT:    v_mul_f32_e32 v44, 1.0, v8
-; SI-NEXT:    v_mul_f32_e32 v46, 1.0, v7
-; SI-NEXT:    v_mul_f32_e32 v40, 1.0, v10
-; SI-NEXT:    v_mul_f32_e32 v43, 1.0, v9
-; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_mul_f32_e32 v61, 1.0, v12
-; SI-NEXT:    v_mul_f32_e32 v25, 1.0, v11
-; SI-NEXT:    v_mul_f32_e32 v57, 1.0, v14
-; SI-NEXT:    v_mul_f32_e32 v60, 1.0, v13
-; SI-NEXT:    v_mul_f32_e32 v36, 1.0, v16
-; SI-NEXT:    v_mul_f32_e32 v37, 1.0, v15
-; SI-NEXT:    v_mul_f32_e32 v32, 1.0, v18
-; SI-NEXT:    v_mul_f32_e32 v34, 1.0, v17
+; SI-NEXT:    v_mul_f32_e32 v20, 1.0, v2
+; SI-NEXT:    v_mul_f32_e32 v21, 1.0, v1
+; SI-NEXT:    v_mul_f32_e32 v24, 1.0, v4
+; SI-NEXT:    v_mul_f32_e32 v25, 1.0, v3
+; SI-NEXT:    v_mul_f32_e32 v22, 1.0, v6
+; SI-NEXT:    v_mul_f32_e32 v23, 1.0, v5
+; SI-NEXT:    v_mul_f32_e32 v28, 1.0, v8
+; SI-NEXT:    v_mul_f32_e32 v29, 1.0, v7
+; SI-NEXT:    v_mul_f32_e32 v26, 1.0, v10
+; SI-NEXT:    v_mul_f32_e32 v27, 1.0, v9
+; SI-NEXT:    v_mul_f32_e32 v32, 1.0, v12
+; SI-NEXT:    v_mul_f32_e32 v33, 1.0, v11
+; SI-NEXT:    v_mul_f32_e32 v30, 1.0, v14
+; SI-NEXT:    v_mul_f32_e32 v31, 1.0, v13
+; SI-NEXT:    v_mul_f32_e32 v35, 1.0, v16
+; SI-NEXT:    v_mul_f32_e32 v36, 1.0, v15
+; SI-NEXT:    v_mul_f32_e32 v18, 1.0, v18
+; SI-NEXT:    v_mul_f32_e32 v17, 1.0, v17
 ; SI-NEXT:    v_mul_f32_e64 v3, 1.0, s16
-; SI-NEXT:    v_mul_f32_e64 v26, 1.0, s19
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s19
 ; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s18
-; SI-NEXT:    v_mul_f32_e64 v5, 1.0, s21
-; SI-NEXT:    v_mul_f32_e64 v6, 1.0, s20
-; SI-NEXT:    v_mul_f32_e64 v29, 1.0, s23
-; SI-NEXT:    v_mul_f32_e64 v4, 1.0, s22
-; SI-NEXT:    v_mul_f32_e64 v8, 1.0, s25
-; SI-NEXT:    v_mul_f32_e64 v38, 1.0, s24
-; SI-NEXT:    v_mul_f32_e64 v35, 1.0, s27
-; SI-NEXT:    v_mul_f32_e64 v7, 1.0, s26
-; SI-NEXT:    v_mul_f32_e64 v53, 1.0, s29
-; SI-NEXT:    v_mul_f32_e64 v55, 1.0, s28
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT:    v_mul_f32_e64 v6, 1.0, s21
+; SI-NEXT:    v_mul_f32_e64 v7, 1.0, s20
+; SI-NEXT:    v_mul_f32_e64 v4, 1.0, s23
+; SI-NEXT:    v_mul_f32_e64 v5, 1.0, s22
+; SI-NEXT:    v_mul_f32_e64 v10, 1.0, s25
+; SI-NEXT:    v_mul_f32_e64 v11, 1.0, s24
+; SI-NEXT:    v_mul_f32_e64 v8, 1.0, s27
+; SI-NEXT:    v_mul_f32_e64 v9, 1.0, s26
+; SI-NEXT:    v_mul_f32_e64 v12, 1.0, s29
+; SI-NEXT:    v_mul_f32_e64 v13, 1.0, s28
+; SI-NEXT:    ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
 ; SI-NEXT:    s_cbranch_scc0 .LBB109_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v19
-; SI-NEXT:    v_alignbit_b32 v23, v1, v3, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
-; SI-NEXT:    v_alignbit_b32 v20, v1, v6, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
-; SI-NEXT:    v_alignbit_b32 v17, v1, v38, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v53
-; SI-NEXT:    v_alignbit_b32 v14, v1, v55, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v50
-; SI-NEXT:    v_alignbit_b32 v11, v1, v52, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v44
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v29
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v1, v46, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v61
-; SI-NEXT:    v_alignbit_b32 v21, v19, v4, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v35
-; SI-NEXT:    v_alignbit_b32 v4, v1, v25, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v36
-; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v26
-; SI-NEXT:    v_alignbit_b32 v18, v16, v7, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v33
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v39
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v40
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v57
-; SI-NEXT:    v_alignbit_b32 v3, v1, v37, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v32
-; SI-NEXT:    v_alignbit_b32 v24, v22, v2, 16
-; SI-NEXT:    v_alignbit_b32 v15, v13, v27, 16
-; SI-NEXT:    v_alignbit_b32 v12, v10, v49, 16
-; SI-NEXT:    v_alignbit_b32 v9, v7, v43, 16
-; SI-NEXT:    v_alignbit_b32 v5, v6, v60, 16
-; SI-NEXT:    v_alignbit_b32 v2, v1, v34, 16
-; SI-NEXT:    v_readfirstlane_b32 s8, v23
-; SI-NEXT:    v_readfirstlane_b32 s9, v24
-; SI-NEXT:    v_readfirstlane_b32 s14, v20
-; SI-NEXT:    v_readfirstlane_b32 s15, v21
-; SI-NEXT:    v_readfirstlane_b32 s20, v17
-; SI-NEXT:    v_readfirstlane_b32 s21, v18
-; SI-NEXT:    v_readfirstlane_b32 s26, v14
-; SI-NEXT:    v_readfirstlane_b32 s27, v15
-; SI-NEXT:    v_readfirstlane_b32 s42, v11
-; SI-NEXT:    v_readfirstlane_b32 s43, v12
-; SI-NEXT:    v_readfirstlane_b32 s56, v8
-; SI-NEXT:    v_readfirstlane_b32 s57, v9
-; SI-NEXT:    v_readfirstlane_b32 s62, v4
-; SI-NEXT:    v_readfirstlane_b32 s63, v5
-; SI-NEXT:    v_readfirstlane_b32 s76, v3
-; SI-NEXT:    v_readfirstlane_b32 s77, v2
-; SI-NEXT:    s_lshr_b64 s[4:5], s[8:9], 24
-; SI-NEXT:    s_lshr_b64 s[6:7], s[8:9], 16
-; SI-NEXT:    s_lshr_b64 s[10:11], s[8:9], 8
-; SI-NEXT:    s_lshr_b64 s[8:9], s[14:15], 24
-; SI-NEXT:    s_lshr_b64 s[12:13], s[14:15], 16
-; SI-NEXT:    s_lshr_b64 s[16:17], s[14:15], 8
-; SI-NEXT:    s_lshr_b64 s[14:15], s[20:21], 24
-; SI-NEXT:    s_lshr_b64 s[18:19], s[20:21], 16
-; SI-NEXT:    s_lshr_b64 s[22:23], s[20:21], 8
-; SI-NEXT:    s_lshr_b64 s[20:21], s[26:27], 24
-; SI-NEXT:    s_lshr_b64 s[24:25], s[26:27], 16
-; SI-NEXT:    s_lshr_b64 s[28:29], s[26:27], 8
-; SI-NEXT:    s_lshr_b64 s[26:27], s[42:43], 24
-; SI-NEXT:    s_lshr_b64 s[40:41], s[42:43], 16
-; SI-NEXT:    s_lshr_b64 s[44:45], s[42:43], 8
-; SI-NEXT:    s_lshr_b64 s[42:43], s[56:57], 24
-; SI-NEXT:    s_lshr_b64 s[46:47], s[56:57], 16
-; SI-NEXT:    s_lshr_b64 s[58:59], s[56:57], 8
-; SI-NEXT:    s_lshr_b64 s[56:57], s[62:63], 24
-; SI-NEXT:    s_lshr_b64 s[60:61], s[62:63], 16
-; SI-NEXT:    s_lshr_b64 s[72:73], s[62:63], 8
-; SI-NEXT:    s_lshr_b64 s[62:63], s[76:77], 24
-; SI-NEXT:    s_lshr_b64 s[74:75], s[76:77], 16
-; SI-NEXT:    s_lshr_b64 s[76:77], s[76:77], 8
-; SI-NEXT:    v_lshrrev_b32_e32 v30, 24, v26
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 8, v24
-; SI-NEXT:    v_lshrrev_b32_e32 v62, 24, v29
-; SI-NEXT:    v_lshrrev_b32_e32 v58, 8, v21
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 24, v35
-; SI-NEXT:    v_lshrrev_b32_e32 v28, 8, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v63, 24, v33
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 8, v15
-; SI-NEXT:    v_lshrrev_b32_e32 v56, 24, v39
-; SI-NEXT:    v_lshrrev_b32_e32 v47, 8, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v45, 24, v40
-; SI-NEXT:    v_lshrrev_b32_e32 v41, 8, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v42, 24, v57
-; SI-NEXT:    v_lshrrev_b32_e32 v54, 8, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v51, 24, v32
-; SI-NEXT:    v_lshrrev_b32_e32 v48, 8, v2
+; SI-NEXT:    v_readfirstlane_b32 s4, v19
+; SI-NEXT:    s_lshr_b32 s5, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s4, v3
+; SI-NEXT:    s_lshr_b64 s[74:75], s[4:5], 16
+; SI-NEXT:    v_readfirstlane_b32 s4, v1
+; SI-NEXT:    s_lshr_b32 s73, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s72, v2
+; SI-NEXT:    s_lshr_b64 s[76:77], s[72:73], 16
+; SI-NEXT:    s_mov_b32 s75, s76
+; SI-NEXT:    s_lshr_b64 s[4:5], s[74:75], 24
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_writelane_b32 v41, s4, 0
+; SI-NEXT:    v_writelane_b32 v41, s5, 1
+; SI-NEXT:    v_readfirstlane_b32 s4, v6
+; SI-NEXT:    s_lshr_b32 s5, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s4, v7
+; SI-NEXT:    s_lshr_b64 s[60:61], s[4:5], 16
+; SI-NEXT:    v_readfirstlane_b32 s4, v4
+; SI-NEXT:    s_lshr_b32 s59, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s4, v10
+; SI-NEXT:    s_lshr_b32 s5, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s4, v11
+; SI-NEXT:    s_lshr_b64 s[46:47], s[4:5], 16
+; SI-NEXT:    v_readfirstlane_b32 s4, v8
+; SI-NEXT:    s_lshr_b32 s45, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s4, v12
+; SI-NEXT:    s_lshr_b32 s5, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s4, v13
+; SI-NEXT:    s_lshr_b64 s[26:27], s[4:5], 16
+; SI-NEXT:    v_readfirstlane_b32 s4, v20
+; SI-NEXT:    s_lshr_b32 s25, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s4, v24
+; SI-NEXT:    s_lshr_b32 s5, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s4, v25
+; SI-NEXT:    s_lshr_b64 s[16:17], s[4:5], 16
+; SI-NEXT:    v_readfirstlane_b32 s4, v22
+; SI-NEXT:    s_lshr_b32 s41, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s4, v28
+; SI-NEXT:    s_lshr_b32 s5, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s4, v29
+; SI-NEXT:    s_lshr_b64 s[20:21], s[4:5], 16
+; SI-NEXT:    v_readfirstlane_b32 s4, v26
+; SI-NEXT:    s_lshr_b32 s19, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s4, v32
+; SI-NEXT:    s_lshr_b32 s5, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s4, v33
+; SI-NEXT:    s_lshr_b64 s[12:13], s[4:5], 16
+; SI-NEXT:    v_readfirstlane_b32 s4, v30
+; SI-NEXT:    s_lshr_b32 s11, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s4, v35
+; SI-NEXT:    s_lshr_b32 s5, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s4, v36
+; SI-NEXT:    s_lshr_b64 s[6:7], s[4:5], 16
+; SI-NEXT:    v_readfirstlane_b32 s4, v18
+; SI-NEXT:    v_readfirstlane_b32 s58, v5
+; SI-NEXT:    v_readfirstlane_b32 s44, v9
+; SI-NEXT:    v_readfirstlane_b32 s24, v21
+; SI-NEXT:    v_readfirstlane_b32 s40, v23
+; SI-NEXT:    v_readfirstlane_b32 s18, v27
+; SI-NEXT:    v_readfirstlane_b32 s10, v31
+; SI-NEXT:    s_lshr_b32 s5, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s4, v17
+; SI-NEXT:    s_lshr_b64 s[62:63], s[58:59], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[44:45], 16
+; SI-NEXT:    s_lshr_b64 s[42:43], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[22:23], s[40:41], 16
+; SI-NEXT:    s_lshr_b64 s[28:29], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[4:5], 16
+; SI-NEXT:    s_mov_b32 s61, s62
+; SI-NEXT:    s_mov_b32 s47, s56
+; SI-NEXT:    s_mov_b32 s27, s42
+; SI-NEXT:    s_mov_b32 s17, s22
+; SI-NEXT:    s_mov_b32 s21, s28
+; SI-NEXT:    s_mov_b32 s13, s14
+; SI-NEXT:    s_mov_b32 s7, s8
+; SI-NEXT:    s_lshr_b64 s[88:89], s[74:75], 16
+; SI-NEXT:    s_lshr_b64 s[92:93], s[74:75], 8
+; SI-NEXT:    s_lshr_b64 s[90:91], s[60:61], 24
+; SI-NEXT:    s_lshr_b64 s[94:95], s[60:61], 16
+; SI-NEXT:    s_lshr_b64 s[30:31], s[60:61], 8
+; SI-NEXT:    s_lshr_b64 s[34:35], s[46:47], 24
+; SI-NEXT:    s_lshr_b64 s[36:37], s[46:47], 16
+; SI-NEXT:    s_lshr_b64 s[38:39], s[46:47], 8
+; SI-NEXT:    s_lshr_b64 s[48:49], s[26:27], 24
+; SI-NEXT:    s_lshr_b64 s[50:51], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[52:53], s[26:27], 8
+; SI-NEXT:    s_lshr_b64 s[54:55], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[64:65], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[66:67], s[16:17], 8
+; SI-NEXT:    s_lshr_b64 s[68:69], s[20:21], 24
+; SI-NEXT:    s_lshr_b64 s[70:71], s[20:21], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v48, 24, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v39, 24, v4
+; SI-NEXT:    s_lshr_b32 s24, s76, 8
+; SI-NEXT:    v_lshrrev_b32_e32 v38, 24, v8
+; SI-NEXT:    s_lshr_b32 s23, s62, 8
+; SI-NEXT:    v_lshrrev_b32_e32 v37, 24, v20
+; SI-NEXT:    s_lshr_b32 s18, s56, 8
+; SI-NEXT:    v_lshrrev_b32_e32 v34, 24, v22
+; SI-NEXT:    s_lshr_b32 s17, s42, 8
+; SI-NEXT:    v_lshrrev_b32_e32 v16, 24, v26
+; SI-NEXT:    s_lshr_b32 s15, s22, 8
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 24, v30
+; SI-NEXT:    s_lshr_b32 s10, s28, 8
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 24, v18
+; SI-NEXT:    s_lshr_b32 s9, s14, 8
+; SI-NEXT:    s_lshr_b32 s4, s8, 8
+; SI-NEXT:    s_lshr_b64 s[78:79], s[20:21], 8
+; SI-NEXT:    s_lshr_b64 s[86:87], s[12:13], 24
+; SI-NEXT:    s_lshr_b64 s[96:97], s[12:13], 16
+; SI-NEXT:    s_lshr_b64 s[98:99], s[12:13], 8
+; SI-NEXT:    s_lshr_b64 s[80:81], s[6:7], 24
+; SI-NEXT:    s_lshr_b64 s[82:83], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[84:85], s[6:7], 8
 ; SI-NEXT:    s_cbranch_execnz .LBB109_3
 ; SI-NEXT:  .LBB109_2: ; %cmp.true
-; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v53
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v55
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v35
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v36
+; SI-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
 ; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
-; SI-NEXT:    v_alignbit_b32 v14, v14, v13, 16
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v38
+; SI-NEXT:    v_readfirstlane_b32 s4, v15
+; SI-NEXT:    s_lshr_b32 s5, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s4, v14
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v17
+; SI-NEXT:    v_add_f32_e32 v15, 0x40c00000, v14
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v18
+; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT:    s_lshr_b64 s[6:7], s[4:5], 16
+; SI-NEXT:    v_readfirstlane_b32 s4, v14
+; SI-NEXT:    s_lshr_b32 s5, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s4, v15
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v33
+; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v32
+; SI-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
 ; SI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v36
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v44
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v50
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v37
-; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v46
-; SI-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v52
+; SI-NEXT:    s_lshr_b64 s[8:9], s[4:5], 16
+; SI-NEXT:    v_readfirstlane_b32 s4, v16
+; SI-NEXT:    v_readfirstlane_b32 s10, v15
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v31
+; SI-NEXT:    s_lshr_b32 s11, s4, 16
+; SI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v15
+; SI-NEXT:    s_lshr_b64 s[12:13], s[10:11], 16
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v30
+; SI-NEXT:    v_readfirstlane_b32 s10, v16
+; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v29
+; SI-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT:    v_and_b32_e32 v17, 0xffff0000, v28
+; SI-NEXT:    v_readfirstlane_b32 s4, v15
+; SI-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
+; SI-NEXT:    v_readfirstlane_b32 s16, v16
+; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v27
+; SI-NEXT:    s_lshr_b32 s11, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s4, v17
+; SI-NEXT:    v_add_f32_e32 v17, 0x40c00000, v16
+; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v26
+; SI-NEXT:    v_readfirstlane_b32 s18, v17
+; SI-NEXT:    v_and_b32_e32 v17, 0xffff0000, v25
+; SI-NEXT:    s_lshr_b32 s17, s4, 16
+; SI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
+; SI-NEXT:    v_and_b32_e32 v18, 0xffff0000, v24
+; SI-NEXT:    s_lshr_b64 s[20:21], s[16:17], 16
+; SI-NEXT:    v_readfirstlane_b32 s4, v16
+; SI-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; SI-NEXT:    v_readfirstlane_b32 s16, v17
+; SI-NEXT:    v_and_b32_e32 v17, 0xffff0000, v23
+; SI-NEXT:    s_lshr_b32 s19, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s4, v18
+; SI-NEXT:    v_add_f32_e32 v18, 0x40c00000, v17
+; SI-NEXT:    v_and_b32_e32 v17, 0xffff0000, v22
+; SI-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
+; SI-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; SI-NEXT:    s_lshr_b32 s17, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s4, v17
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; SI-NEXT:    s_lshr_b32 s41, s4, 16
+; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT:    v_readfirstlane_b32 s4, v12
+; SI-NEXT:    v_and_b32_e32 v12, 0xffff0000, v21
+; SI-NEXT:    v_readfirstlane_b32 s24, v13
+; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v12
+; SI-NEXT:    v_and_b32_e32 v12, 0xffff0000, v20
+; SI-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT:    s_lshr_b32 s25, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s4, v12
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; SI-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT:    s_lshr_b64 s[26:27], s[24:25], 16
+; SI-NEXT:    s_lshr_b32 s25, s4, 16
 ; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v61
+; SI-NEXT:    v_readfirstlane_b32 s4, v10
+; SI-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT:    s_lshr_b32 s45, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s44, v11
+; SI-NEXT:    v_readfirstlane_b32 s4, v8
+; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT:    s_lshr_b64 s[46:47], s[44:45], 16
+; SI-NEXT:    s_lshr_b32 s45, s4, 16
 ; SI-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; SI-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_alignbit_b32 v3, v2, v1, 16
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v34
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v25
+; SI-NEXT:    v_readfirstlane_b32 s4, v6
+; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT:    s_lshr_b32 s59, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s58, v7
 ; SI-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT:    v_alignbit_b32 v8, v8, v7, 16
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v43
-; SI-NEXT:    v_alignbit_b32 v11, v11, v10, 16
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v49
-; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v1
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v32
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT:    s_lshr_b64 s[60:61], s[58:59], 16
 ; SI-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v57
-; SI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v7
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v40
-; SI-NEXT:    v_add_f32_e32 v12, 0x40c00000, v10
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v39
-; SI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v1
-; SI-NEXT:    v_alignbit_b32 v4, v5, v4, 16
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v60
-; SI-NEXT:    v_add_f32_e32 v25, 0x40c00000, v6
-; SI-NEXT:    v_add_f32_e32 v34, 0x40c00000, v7
-; SI-NEXT:    v_add_f32_e32 v36, 0x40c00000, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v32
+; SI-NEXT:    v_readfirstlane_b32 s58, v5
+; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v19
+; SI-NEXT:    v_readfirstlane_b32 s4, v4
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; SI-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v25
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v34
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v36
-; SI-NEXT:    v_alignbit_b32 v2, v1, v2, 16
-; SI-NEXT:    v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v9, v7, v9, 16
-; SI-NEXT:    v_alignbit_b32 v12, v10, v12, 16
-; SI-NEXT:    v_readfirstlane_b32 s76, v3
-; SI-NEXT:    v_readfirstlane_b32 s77, v2
-; SI-NEXT:    v_readfirstlane_b32 s62, v4
-; SI-NEXT:    v_readfirstlane_b32 s63, v5
-; SI-NEXT:    v_readfirstlane_b32 s56, v8
-; SI-NEXT:    v_readfirstlane_b32 s57, v9
-; SI-NEXT:    v_readfirstlane_b32 s42, v11
-; SI-NEXT:    v_readfirstlane_b32 s43, v12
-; SI-NEXT:    v_readfirstlane_b32 s26, v14
-; SI-NEXT:    s_lshr_b64 s[40:41], s[42:43], 16
-; SI-NEXT:    s_lshr_b64 s[44:45], s[42:43], 8
-; SI-NEXT:    s_lshr_b64 s[46:47], s[56:57], 16
-; SI-NEXT:    s_lshr_b64 s[58:59], s[56:57], 8
-; SI-NEXT:    s_lshr_b64 s[60:61], s[62:63], 16
-; SI-NEXT:    s_lshr_b64 s[72:73], s[62:63], 8
-; SI-NEXT:    s_lshr_b64 s[74:75], s[76:77], 16
-; SI-NEXT:    v_lshrrev_b32_e32 v47, 8, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v41, 8, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v54, 8, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v48, 8, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v56, 24, v36
-; SI-NEXT:    v_lshrrev_b32_e32 v45, 24, v34
-; SI-NEXT:    v_lshrrev_b32_e32 v42, 24, v25
-; SI-NEXT:    v_lshrrev_b32_e32 v51, 24, v32
-; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
-; SI-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
-; SI-NEXT:    v_alignbit_b32 v17, v17, v16, 16
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT:    v_readfirstlane_b32 s20, v17
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
-; SI-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
-; SI-NEXT:    v_add_f32_e32 v23, 0x40c00000, v23
-; SI-NEXT:    v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
-; SI-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
-; SI-NEXT:    v_alignbit_b32 v20, v20, v19, 16
-; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT:    v_alignbit_b32 v23, v23, v22, 16
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; SI-NEXT:    v_add_f32_e32 v15, 0x40c00000, v13
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v33
-; SI-NEXT:    v_add_f32_e32 v18, 0x40c00000, v16
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v35
-; SI-NEXT:    v_add_f32_e32 v33, 0x40c00000, v13
-; SI-NEXT:    v_add_f32_e32 v31, 0x40c00000, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v33
-; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v31
-; SI-NEXT:    v_alignbit_b32 v15, v13, v15, 16
-; SI-NEXT:    v_alignbit_b32 v18, v16, v18, 16
-; SI-NEXT:    v_readfirstlane_b32 s27, v15
-; SI-NEXT:    v_readfirstlane_b32 s21, v18
-; SI-NEXT:    v_readfirstlane_b32 s14, v20
-; SI-NEXT:    v_readfirstlane_b32 s8, v23
-; SI-NEXT:    s_lshr_b64 s[18:19], s[20:21], 16
-; SI-NEXT:    s_lshr_b64 s[22:23], s[20:21], 8
-; SI-NEXT:    s_lshr_b64 s[24:25], s[26:27], 16
-; SI-NEXT:    s_lshr_b64 s[28:29], s[26:27], 8
-; SI-NEXT:    v_lshrrev_b32_e32 v28, 8, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 8, v15
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 24, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v63, 24, v33
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; SI-NEXT:    v_add_f32_e32 v21, 0x40c00000, v19
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff0000, v29
-; SI-NEXT:    v_add_f32_e32 v24, 0x40c00000, v22
-; SI-NEXT:    v_and_b32_e32 v22, 0xffff0000, v26
-; SI-NEXT:    v_add_f32_e32 v29, 0x40c00000, v19
-; SI-NEXT:    v_add_f32_e32 v26, 0x40c00000, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v29
-; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v26
-; SI-NEXT:    v_alignbit_b32 v21, v19, v21, 16
-; SI-NEXT:    v_alignbit_b32 v24, v22, v24, 16
-; SI-NEXT:    v_readfirstlane_b32 s15, v21
-; SI-NEXT:    v_readfirstlane_b32 s9, v24
-; SI-NEXT:    s_lshr_b64 s[4:5], s[8:9], 24
-; SI-NEXT:    s_lshr_b64 s[6:7], s[8:9], 16
-; SI-NEXT:    s_lshr_b64 s[10:11], s[8:9], 8
-; SI-NEXT:    s_lshr_b64 s[8:9], s[14:15], 24
-; SI-NEXT:    s_lshr_b64 s[12:13], s[14:15], 16
-; SI-NEXT:    s_lshr_b64 s[16:17], s[14:15], 8
-; SI-NEXT:    s_lshr_b64 s[14:15], s[20:21], 24
-; SI-NEXT:    s_lshr_b64 s[20:21], s[26:27], 24
-; SI-NEXT:    s_lshr_b64 s[26:27], s[42:43], 24
-; SI-NEXT:    s_lshr_b64 s[42:43], s[56:57], 24
-; SI-NEXT:    s_lshr_b64 s[56:57], s[62:63], 24
-; SI-NEXT:    s_lshr_b64 s[62:63], s[76:77], 24
-; SI-NEXT:    s_lshr_b64 s[76:77], s[76:77], 8
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 8, v24
-; SI-NEXT:    v_lshrrev_b32_e32 v58, 8, v21
-; SI-NEXT:    v_lshrrev_b32_e32 v30, 24, v26
-; SI-NEXT:    v_lshrrev_b32_e32 v62, 24, v29
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT:    s_lshr_b32 s59, s4, 16
+; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT:    v_readfirstlane_b32 s4, v5
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    s_lshr_b32 s73, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s72, v3
+; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT:    v_readfirstlane_b32 s4, v1
+; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; SI-NEXT:    s_lshr_b64 s[74:75], s[72:73], 16
+; SI-NEXT:    s_lshr_b32 s73, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s72, v2
+; SI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT:    s_lshr_b64 s[76:77], s[72:73], 16
+; SI-NEXT:    v_readfirstlane_b32 s40, v18
+; SI-NEXT:    v_readfirstlane_b32 s24, v13
+; SI-NEXT:    v_readfirstlane_b32 s44, v9
+; SI-NEXT:    s_mov_b32 s75, s76
+; SI-NEXT:    s_lshr_b64 s[14:15], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[28:29], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[16:17], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[22:23], s[40:41], 16
+; SI-NEXT:    s_lshr_b64 s[42:43], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[44:45], 16
+; SI-NEXT:    s_lshr_b64 s[62:63], s[58:59], 16
+; SI-NEXT:    s_lshr_b64 s[78:79], s[74:75], 24
+; SI-NEXT:    s_mov_b32 s7, s8
+; SI-NEXT:    s_mov_b32 s13, s14
+; SI-NEXT:    s_mov_b32 s21, s28
+; SI-NEXT:    s_mov_b32 s17, s22
+; SI-NEXT:    s_mov_b32 s27, s42
+; SI-NEXT:    s_mov_b32 s47, s56
+; SI-NEXT:    s_mov_b32 s61, s62
+; SI-NEXT:    v_writelane_b32 v41, s78, 0
+; SI-NEXT:    v_writelane_b32 v41, s79, 1
+; SI-NEXT:    s_lshr_b64 s[88:89], s[74:75], 16
+; SI-NEXT:    s_lshr_b64 s[92:93], s[74:75], 8
+; SI-NEXT:    s_lshr_b64 s[90:91], s[60:61], 24
+; SI-NEXT:    s_lshr_b64 s[94:95], s[60:61], 16
+; SI-NEXT:    s_lshr_b64 s[30:31], s[60:61], 8
+; SI-NEXT:    s_lshr_b64 s[34:35], s[46:47], 24
+; SI-NEXT:    s_lshr_b64 s[36:37], s[46:47], 16
+; SI-NEXT:    s_lshr_b64 s[38:39], s[46:47], 8
+; SI-NEXT:    s_lshr_b64 s[48:49], s[26:27], 24
+; SI-NEXT:    s_lshr_b64 s[50:51], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[52:53], s[26:27], 8
+; SI-NEXT:    s_lshr_b64 s[54:55], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[64:65], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[66:67], s[16:17], 8
+; SI-NEXT:    s_lshr_b64 s[68:69], s[20:21], 24
+; SI-NEXT:    s_lshr_b64 s[70:71], s[20:21], 16
+; SI-NEXT:    s_lshr_b32 s24, s76, 8
+; SI-NEXT:    s_lshr_b32 s23, s62, 8
+; SI-NEXT:    s_lshr_b32 s18, s56, 8
+; SI-NEXT:    s_lshr_b32 s17, s42, 8
+; SI-NEXT:    s_lshr_b32 s15, s22, 8
+; SI-NEXT:    s_lshr_b32 s10, s28, 8
+; SI-NEXT:    s_lshr_b32 s9, s14, 8
+; SI-NEXT:    s_lshr_b32 s4, s8, 8
+; SI-NEXT:    v_lshrrev_b32_e32 v48, 24, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v39, 24, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v38, 24, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v37, 24, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v34, 24, v17
+; SI-NEXT:    v_lshrrev_b32_e32 v16, 24, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 24, v15
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 24, v14
+; SI-NEXT:    s_lshr_b64 s[78:79], s[20:21], 8
+; SI-NEXT:    s_lshr_b64 s[86:87], s[12:13], 24
+; SI-NEXT:    s_lshr_b64 s[96:97], s[12:13], 16
+; SI-NEXT:    s_lshr_b64 s[98:99], s[12:13], 8
+; SI-NEXT:    s_lshr_b64 s[80:81], s[6:7], 24
+; SI-NEXT:    s_lshr_b64 s[82:83], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[84:85], s[6:7], 8
 ; SI-NEXT:  .LBB109_3: ; %end
-; SI-NEXT:    v_and_b32_e32 v23, 0xff, v23
-; SI-NEXT:    s_lshl_b32 s5, s10, 8
-; SI-NEXT:    v_or_b32_e32 v23, s5, v23
-; SI-NEXT:    s_and_b32 s5, s6, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s4, s4, 24
-; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v23
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_or_b32_e32 v23, s4, v23
-; SI-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen
+; SI-NEXT:    s_and_b32 s7, s74, 0xff
+; SI-NEXT:    s_lshl_b32 s13, s92, 8
+; SI-NEXT:    s_or_b32 s7, s7, s13
+; SI-NEXT:    s_and_b32 s13, s88, 0xff
+; SI-NEXT:    v_readlane_b32 s74, v41, 0
+; SI-NEXT:    s_lshl_b32 s21, s74, 24
+; SI-NEXT:    s_lshl_b32 s13, s13, 16
+; SI-NEXT:    s_or_b32 s13, s21, s13
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s7, s7, s13
+; SI-NEXT:    v_mov_b32_e32 v1, s7
+; SI-NEXT:    s_and_b32 s7, s76, 0xff
+; SI-NEXT:    s_lshl_b32 s13, s24, 8
+; SI-NEXT:    s_or_b32 s7, s7, s13
+; SI-NEXT:    s_and_b32 s13, s73, 0xff
+; SI-NEXT:    s_lshl_b32 s13, s13, 16
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 24, v48
+; SI-NEXT:    v_or_b32_e32 v2, s13, v2
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    v_or_b32_e32 v2, s7, v2
+; SI-NEXT:    s_and_b32 s7, s60, 0xff
+; SI-NEXT:    s_lshl_b32 s13, s30, 8
+; SI-NEXT:    s_or_b32 s7, s7, s13
+; SI-NEXT:    s_and_b32 s13, s94, 0xff
+; SI-NEXT:    s_lshl_b32 s13, s13, 16
+; SI-NEXT:    s_lshl_b32 s21, s90, 24
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s13, s21, s13
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v23, 0xff, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 8, v27
-; SI-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; SI-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; SI-NEXT:    s_lshl_b32 s4, s16, 8
-; SI-NEXT:    v_or_b32_e32 v23, v23, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 24, v30
-; SI-NEXT:    v_or_b32_e32 v20, s4, v20
-; SI-NEXT:    s_and_b32 s4, s12, 0xff
-; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v23
-; SI-NEXT:    v_or_b32_e32 v22, v24, v22
-; SI-NEXT:    s_lshl_b32 s4, s4, 16
-; SI-NEXT:    s_lshl_b32 s5, s8, 24
-; SI-NEXT:    v_or_b32_e32 v22, v23, v22
-; SI-NEXT:    v_add_i32_e32 v23, vcc, 4, v0
-; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v20
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    buffer_store_dword v22, v23, s[0:3], 0 offen
-; SI-NEXT:    v_or_b32_e32 v20, s4, v20
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
+; SI-NEXT:    s_or_b32 s7, s7, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v22, vcc, 8, v0
-; SI-NEXT:    buffer_store_dword v20, v22, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    s_and_b32 s7, s62, 0xff
+; SI-NEXT:    s_lshl_b32 s13, s23, 8
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
+; SI-NEXT:    s_or_b32 s7, s7, s13
+; SI-NEXT:    s_and_b32 s13, s59, 0xff
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_lshl_b32 s13, s13, 16
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v39
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    v_or_b32_e32 v1, s13, v1
+; SI-NEXT:    v_or_b32_e32 v1, s7, v1
+; SI-NEXT:    s_and_b32 s7, s46, 0xff
+; SI-NEXT:    s_lshl_b32 s13, s38, 8
+; SI-NEXT:    s_or_b32 s7, s7, s13
+; SI-NEXT:    s_and_b32 s13, s36, 0xff
+; SI-NEXT:    s_lshl_b32 s13, s13, 16
+; SI-NEXT:    s_lshl_b32 s21, s34, 24
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s13, s21, s13
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v20, 0xff, v21
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 8, v58
-; SI-NEXT:    v_and_b32_e32 v19, 0xff, v19
-; SI-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; SI-NEXT:    s_lshl_b32 s4, s22, 8
-; SI-NEXT:    v_or_b32_e32 v20, v20, v21
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 24, v62
-; SI-NEXT:    v_or_b32_e32 v17, s4, v17
-; SI-NEXT:    s_and_b32 s4, s18, 0xff
-; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v20
-; SI-NEXT:    v_or_b32_e32 v19, v21, v19
-; SI-NEXT:    s_lshl_b32 s4, s4, 16
-; SI-NEXT:    s_lshl_b32 s5, s14, 24
-; SI-NEXT:    v_or_b32_e32 v19, v20, v19
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 12, v0
-; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
-; SI-NEXT:    v_or_b32_e32 v17, s4, v17
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 12, v0
+; SI-NEXT:    s_or_b32 s7, s7, s13
+; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    s_and_b32 s7, s56, 0xff
+; SI-NEXT:    s_lshl_b32 s13, s18, 8
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 16, v0
-; SI-NEXT:    buffer_store_dword v17, v19, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 16, v0
+; SI-NEXT:    s_or_b32 s7, s7, s13
+; SI-NEXT:    s_and_b32 s13, s45, 0xff
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_lshl_b32 s13, s13, 16
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v38
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    v_or_b32_e32 v1, s13, v1
+; SI-NEXT:    v_or_b32_e32 v1, s7, v1
+; SI-NEXT:    s_and_b32 s7, s26, 0xff
+; SI-NEXT:    s_lshl_b32 s13, s52, 8
+; SI-NEXT:    s_or_b32 s7, s7, s13
+; SI-NEXT:    s_and_b32 s13, s50, 0xff
+; SI-NEXT:    s_lshl_b32 s13, s13, 16
+; SI-NEXT:    s_lshl_b32 s18, s48, 24
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s13, s18, s13
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v17, 0xff, v18
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 8, v28
-; SI-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; SI-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; SI-NEXT:    s_lshl_b32 s4, s28, 8
-; SI-NEXT:    v_or_b32_e32 v17, v17, v18
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 24, v31
-; SI-NEXT:    v_or_b32_e32 v14, s4, v14
-; SI-NEXT:    s_and_b32 s4, s24, 0xff
-; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; SI-NEXT:    v_or_b32_e32 v16, v18, v16
-; SI-NEXT:    s_lshl_b32 s4, s4, 16
-; SI-NEXT:    s_lshl_b32 s5, s20, 24
-; SI-NEXT:    v_or_b32_e32 v16, v17, v16
-; SI-NEXT:    v_add_i32_e32 v17, vcc, 20, v0
-; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    buffer_store_dword v16, v17, s[0:3], 0 offen
-; SI-NEXT:    v_or_b32_e32 v14, s4, v14
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 20, v0
+; SI-NEXT:    s_or_b32 s7, s7, s13
+; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    s_and_b32 s7, s42, 0xff
+; SI-NEXT:    s_lshl_b32 s13, s17, 8
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v16, vcc, 24, v0
-; SI-NEXT:    buffer_store_dword v14, v16, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 24, v0
+; SI-NEXT:    s_or_b32 s7, s7, s13
+; SI-NEXT:    s_and_b32 s13, s25, 0xff
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_lshl_b32 s13, s13, 16
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v37
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    v_or_b32_e32 v1, s13, v1
+; SI-NEXT:    v_or_b32_e32 v1, s7, v1
+; SI-NEXT:    s_and_b32 s7, s16, 0xff
+; SI-NEXT:    s_lshl_b32 s13, s66, 8
+; SI-NEXT:    s_or_b32 s7, s7, s13
+; SI-NEXT:    s_and_b32 s13, s64, 0xff
+; SI-NEXT:    s_lshl_b32 s13, s13, 16
+; SI-NEXT:    s_lshl_b32 s16, s54, 24
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s13, s16, s13
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v14, 0xff, v15
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 8, v59
-; SI-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; SI-NEXT:    s_lshl_b32 s4, s44, 8
-; SI-NEXT:    v_or_b32_e32 v14, v14, v15
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 24, v63
-; SI-NEXT:    v_or_b32_e32 v11, s4, v11
-; SI-NEXT:    s_and_b32 s4, s40, 0xff
-; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; SI-NEXT:    v_or_b32_e32 v13, v15, v13
-; SI-NEXT:    s_lshl_b32 s4, s4, 16
-; SI-NEXT:    s_lshl_b32 s5, s26, 24
-; SI-NEXT:    v_or_b32_e32 v13, v14, v13
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 28, v0
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
-; SI-NEXT:    v_or_b32_e32 v11, s4, v11
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 28, v0
+; SI-NEXT:    s_or_b32 s7, s7, s13
+; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    s_and_b32 s7, s22, 0xff
+; SI-NEXT:    s_lshl_b32 s13, s15, 8
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v13, vcc, 32, v0
-; SI-NEXT:    buffer_store_dword v11, v13, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 32, v0
+; SI-NEXT:    s_or_b32 s7, s7, s13
+; SI-NEXT:    s_and_b32 s13, s41, 0xff
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_lshl_b32 s13, s13, 16
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v34
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    v_or_b32_e32 v1, s13, v1
+; SI-NEXT:    v_or_b32_e32 v1, s7, v1
+; SI-NEXT:    s_and_b32 s7, s20, 0xff
+; SI-NEXT:    s_lshl_b32 s13, s78, 8
+; SI-NEXT:    s_or_b32 s7, s7, s13
+; SI-NEXT:    s_and_b32 s13, s70, 0xff
+; SI-NEXT:    s_lshl_b32 s13, s13, 16
+; SI-NEXT:    s_lshl_b32 s15, s68, 24
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s13, s15, s13
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 8, v47
-; SI-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; SI-NEXT:    s_lshl_b32 s4, s58, 8
-; SI-NEXT:    v_or_b32_e32 v11, v11, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 24, v56
-; SI-NEXT:    v_or_b32_e32 v8, s4, v8
-; SI-NEXT:    s_and_b32 s4, s46, 0xff
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; SI-NEXT:    v_or_b32_e32 v10, v12, v10
-; SI-NEXT:    s_lshl_b32 s4, s4, 16
-; SI-NEXT:    s_lshl_b32 s5, s42, 24
-; SI-NEXT:    v_or_b32_e32 v10, v11, v10
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 36, v0
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    buffer_store_dword v10, v11, s[0:3], 0 offen
-; SI-NEXT:    v_or_b32_e32 v8, s4, v8
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 36, v0
+; SI-NEXT:    s_or_b32 s7, s7, s13
+; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    s_and_b32 s7, s28, 0xff
+; SI-NEXT:    s_lshl_b32 s10, s10, 8
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 40, v0
-; SI-NEXT:    buffer_store_dword v8, v10, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 40, v0
+; SI-NEXT:    s_or_b32 s7, s7, s10
+; SI-NEXT:    s_and_b32 s10, s19, 0xff
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_lshl_b32 s10, s10, 16
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v16
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    v_or_b32_e32 v1, s10, v1
+; SI-NEXT:    v_or_b32_e32 v1, s7, v1
+; SI-NEXT:    s_and_b32 s7, s12, 0xff
+; SI-NEXT:    s_lshl_b32 s10, s98, 8
+; SI-NEXT:    s_or_b32 s7, s7, s10
+; SI-NEXT:    s_and_b32 s10, s96, 0xff
+; SI-NEXT:    s_lshl_b32 s10, s10, 16
+; SI-NEXT:    s_lshl_b32 s12, s86, 24
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s10, s12, s10
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 8, v41
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; SI-NEXT:    s_lshl_b32 s4, s72, 8
-; SI-NEXT:    v_or_b32_e32 v8, v8, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 24, v45
-; SI-NEXT:    v_or_b32_e32 v4, s4, v4
-; SI-NEXT:    s_and_b32 s4, s60, 0xff
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; SI-NEXT:    v_or_b32_e32 v7, v9, v7
-; SI-NEXT:    s_lshl_b32 s4, s4, 16
-; SI-NEXT:    s_lshl_b32 s5, s56, 24
-; SI-NEXT:    v_or_b32_e32 v7, v8, v7
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 44, v0
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT:    v_or_b32_e32 v4, s4, v4
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 44, v0
+; SI-NEXT:    s_or_b32 s7, s7, s10
+; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    s_and_b32 s7, s14, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s9, 8
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 48, v0
-; SI-NEXT:    buffer_store_dword v4, v7, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 48, v0
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    s_and_b32 s9, s11, 0xff
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v15
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    v_or_b32_e32 v1, s9, v1
+; SI-NEXT:    v_or_b32_e32 v1, s7, v1
+; SI-NEXT:    s_and_b32 s6, s6, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s84, 8
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    s_and_b32 s7, s82, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    s_lshl_b32 s9, s80, 24
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_or_b32 s7, s9, s7
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xff, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v54
-; SI-NEXT:    v_or_b32_e32 v4, v4, v5
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; SI-NEXT:    s_lshl_b32 s4, s76, 8
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v42
-; SI-NEXT:    v_or_b32_e32 v3, s4, v3
-; SI-NEXT:    s_and_b32 s4, s74, 0xff
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT:    v_or_b32_e32 v5, v6, v5
-; SI-NEXT:    s_lshl_b32 s4, s4, 16
-; SI-NEXT:    s_lshl_b32 s5, s62, 24
-; SI-NEXT:    v_or_b32_e32 v4, v4, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 52, v0
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT:    v_or_b32_e32 v3, s4, v3
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 52, v0
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 56, v0
-; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v48
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v51
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    v_or_b32_e32 v1, v3, v1
-; SI-NEXT:    v_or_b32_e32 v1, v2, v1
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 56, v0
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s6, s8, 0xff
+; SI-NEXT:    s_lshl_b32 s4, s4, 8
+; SI-NEXT:    s_and_b32 s5, s5, 0xff
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_or_b32 s4, s6, s4
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v14
+; SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; SI-NEXT:    v_or_b32_e32 v1, s5, v1
+; SI-NEXT:    v_or_b32_e32 v1, s4, v1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 60, v0
+; SI-NEXT:    v_readlane_b32 s30, v40, 34
+; SI-NEXT:    v_readlane_b32 s75, v41, 1
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT:    v_readlane_b32 s31, v40, 35
+; SI-NEXT:    v_readlane_b32 s99, v40, 33
+; SI-NEXT:    v_readlane_b32 s98, v40, 32
+; SI-NEXT:    v_readlane_b32 s97, v40, 31
+; SI-NEXT:    v_readlane_b32 s96, v40, 30
+; SI-NEXT:    v_readlane_b32 s87, v40, 29
+; SI-NEXT:    v_readlane_b32 s86, v40, 28
+; SI-NEXT:    v_readlane_b32 s85, v40, 27
+; SI-NEXT:    v_readlane_b32 s84, v40, 26
+; SI-NEXT:    v_readlane_b32 s83, v40, 25
+; SI-NEXT:    v_readlane_b32 s82, v40, 24
+; SI-NEXT:    v_readlane_b32 s81, v40, 23
+; SI-NEXT:    v_readlane_b32 s80, v40, 22
+; SI-NEXT:    v_readlane_b32 s71, v40, 21
+; SI-NEXT:    v_readlane_b32 s70, v40, 20
+; SI-NEXT:    v_readlane_b32 s69, v40, 19
+; SI-NEXT:    v_readlane_b32 s68, v40, 18
+; SI-NEXT:    v_readlane_b32 s67, v40, 17
+; SI-NEXT:    v_readlane_b32 s66, v40, 16
+; SI-NEXT:    v_readlane_b32 s65, v40, 15
+; SI-NEXT:    v_readlane_b32 s64, v40, 14
+; SI-NEXT:    v_readlane_b32 s55, v40, 13
+; SI-NEXT:    v_readlane_b32 s54, v40, 12
+; SI-NEXT:    v_readlane_b32 s53, v40, 11
+; SI-NEXT:    v_readlane_b32 s52, v40, 10
+; SI-NEXT:    v_readlane_b32 s51, v40, 9
+; SI-NEXT:    v_readlane_b32 s50, v40, 8
+; SI-NEXT:    v_readlane_b32 s49, v40, 7
+; SI-NEXT:    v_readlane_b32 s48, v40, 6
+; SI-NEXT:    v_readlane_b32 s39, v40, 5
+; SI-NEXT:    v_readlane_b32 s38, v40, 4
+; SI-NEXT:    v_readlane_b32 s37, v40, 3
+; SI-NEXT:    v_readlane_b32 s36, v40, 2
+; SI-NEXT:    v_readlane_b32 s35, v40, 1
+; SI-NEXT:    v_readlane_b32 s34, v40, 0
+; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB109_4:
-; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $sgpr10
-; SI-NEXT:    ; implicit-def: $sgpr6
 ; SI-NEXT:    ; implicit-def: $sgpr4
-; SI-NEXT:    ; implicit-def: $vgpr24
-; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr30
-; SI-NEXT:    ; implicit-def: $vgpr20
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_writelane_b32 v41, s4, 0
+; SI-NEXT:    ; implicit-def: $sgpr74
+; SI-NEXT:    ; implicit-def: $sgpr92
+; SI-NEXT:    ; implicit-def: $sgpr88
+; SI-NEXT:    v_writelane_b32 v41, s5, 1
+; SI-NEXT:    ; implicit-def: $sgpr76
+; SI-NEXT:    ; implicit-def: $sgpr24
+; SI-NEXT:    ; implicit-def: $sgpr73
+; SI-NEXT:    ; implicit-def: $vgpr48
+; SI-NEXT:    ; implicit-def: $sgpr60
+; SI-NEXT:    ; implicit-def: $sgpr30
+; SI-NEXT:    ; implicit-def: $sgpr94
+; SI-NEXT:    ; implicit-def: $sgpr90
+; SI-NEXT:    ; implicit-def: $sgpr62
+; SI-NEXT:    ; implicit-def: $sgpr23
+; SI-NEXT:    ; implicit-def: $sgpr59
+; SI-NEXT:    ; implicit-def: $vgpr39
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr38
+; SI-NEXT:    ; implicit-def: $sgpr36
+; SI-NEXT:    ; implicit-def: $sgpr34
+; SI-NEXT:    ; implicit-def: $sgpr56
+; SI-NEXT:    ; implicit-def: $sgpr18
+; SI-NEXT:    ; implicit-def: $sgpr45
+; SI-NEXT:    ; implicit-def: $vgpr38
+; SI-NEXT:    ; implicit-def: $sgpr26
+; SI-NEXT:    ; implicit-def: $sgpr52
+; SI-NEXT:    ; implicit-def: $sgpr50
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr42
+; SI-NEXT:    ; implicit-def: $sgpr17
+; SI-NEXT:    ; implicit-def: $sgpr25
+; SI-NEXT:    ; implicit-def: $vgpr37
 ; SI-NEXT:    ; implicit-def: $sgpr16
-; SI-NEXT:    ; implicit-def: $sgpr12
-; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr58
-; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    ; implicit-def: $vgpr62
-; SI-NEXT:    ; implicit-def: $vgpr17
+; SI-NEXT:    ; implicit-def: $sgpr66
+; SI-NEXT:    ; implicit-def: $sgpr64
+; SI-NEXT:    ; implicit-def: $sgpr54
 ; SI-NEXT:    ; implicit-def: $sgpr22
-; SI-NEXT:    ; implicit-def: $sgpr18
-; SI-NEXT:    ; implicit-def: $sgpr14
-; SI-NEXT:    ; implicit-def: $vgpr18
-; SI-NEXT:    ; implicit-def: $vgpr28
+; SI-NEXT:    ; implicit-def: $sgpr15
+; SI-NEXT:    ; implicit-def: $vgpr34
+; SI-NEXT:    ; implicit-def: $sgpr10
 ; SI-NEXT:    ; implicit-def: $vgpr16
-; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; implicit-def: $sgpr9
+; SI-NEXT:    ; implicit-def: $vgpr15
+; SI-NEXT:    ; implicit-def: $sgpr4
 ; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; implicit-def: $sgpr28
-; SI-NEXT:    ; implicit-def: $sgpr24
+; SI-NEXT:    ; implicit-def: $sgpr41
 ; SI-NEXT:    ; implicit-def: $sgpr20
-; SI-NEXT:    ; implicit-def: $vgpr15
-; SI-NEXT:    ; implicit-def: $vgpr59
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $vgpr63
-; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $sgpr44
-; SI-NEXT:    ; implicit-def: $sgpr40
-; SI-NEXT:    ; implicit-def: $sgpr26
-; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; implicit-def: $vgpr47
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $vgpr56
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $sgpr58
-; SI-NEXT:    ; implicit-def: $sgpr46
-; SI-NEXT:    ; implicit-def: $sgpr42
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr41
-; SI-NEXT:    ; implicit-def: $vgpr7
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $sgpr72
-; SI-NEXT:    ; implicit-def: $sgpr60
-; SI-NEXT:    ; implicit-def: $sgpr56
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $sgpr76
-; SI-NEXT:    ; implicit-def: $sgpr74
-; SI-NEXT:    ; implicit-def: $sgpr62
-; SI-NEXT:    ; implicit-def: $vgpr54
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $vgpr42
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr48
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr51
+; SI-NEXT:    ; implicit-def: $sgpr78
+; SI-NEXT:    ; implicit-def: $sgpr70
+; SI-NEXT:    ; implicit-def: $sgpr68
+; SI-NEXT:    ; implicit-def: $sgpr28
+; SI-NEXT:    ; implicit-def: $sgpr19
+; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr98
+; SI-NEXT:    ; implicit-def: $sgpr96
+; SI-NEXT:    ; implicit-def: $sgpr86
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr11
+; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr84
+; SI-NEXT:    ; implicit-def: $sgpr82
+; SI-NEXT:    ; implicit-def: $sgpr80
+; SI-NEXT:    ; implicit-def: $sgpr8
+; SI-NEXT:    ; implicit-def: $sgpr5
 ; SI-NEXT:    s_branch .LBB109_2
 ;
 ; VI-LABEL: bitcast_v32bf16_to_v64i8_scalar:
 ; VI: ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; VI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; VI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT:    v_writelane_b32 v63, s34, 0
-; VI-NEXT:    v_writelane_b32 v63, s35, 1
-; VI-NEXT:    v_writelane_b32 v63, s36, 2
-; VI-NEXT:    v_writelane_b32 v63, s37, 3
-; VI-NEXT:    v_writelane_b32 v63, s38, 4
-; VI-NEXT:    v_writelane_b32 v63, s39, 5
-; VI-NEXT:    v_writelane_b32 v63, s48, 6
-; VI-NEXT:
v_writelane_b32 v63, s49, 7 -; VI-NEXT: v_writelane_b32 v63, s50, 8 -; VI-NEXT: v_writelane_b32 v63, s51, 9 -; VI-NEXT: v_writelane_b32 v63, s52, 10 -; VI-NEXT: v_writelane_b32 v63, s53, 11 -; VI-NEXT: v_writelane_b32 v63, s54, 12 -; VI-NEXT: v_writelane_b32 v63, s55, 13 -; VI-NEXT: v_writelane_b32 v63, s64, 14 -; VI-NEXT: v_writelane_b32 v63, s65, 15 -; VI-NEXT: v_writelane_b32 v63, s66, 16 -; VI-NEXT: v_writelane_b32 v63, s67, 17 -; VI-NEXT: v_writelane_b32 v63, s30, 18 -; VI-NEXT: v_writelane_b32 v63, s31, 19 +; VI-NEXT: v_writelane_b32 v4, s34, 0 +; VI-NEXT: v_writelane_b32 v4, s35, 1 +; VI-NEXT: v_writelane_b32 v4, s36, 2 +; VI-NEXT: v_writelane_b32 v4, s37, 3 +; VI-NEXT: v_writelane_b32 v4, s38, 4 +; VI-NEXT: v_writelane_b32 v4, s39, 5 +; VI-NEXT: v_writelane_b32 v4, s48, 6 +; VI-NEXT: v_writelane_b32 v4, s49, 7 +; VI-NEXT: v_writelane_b32 v4, s50, 8 +; VI-NEXT: v_writelane_b32 v4, s51, 9 +; VI-NEXT: v_writelane_b32 v4, s52, 10 +; VI-NEXT: v_writelane_b32 v4, s53, 11 +; VI-NEXT: v_writelane_b32 v4, s54, 12 +; VI-NEXT: v_writelane_b32 v4, s55, 13 +; VI-NEXT: v_writelane_b32 v4, s64, 14 +; VI-NEXT: v_writelane_b32 v4, s65, 15 +; VI-NEXT: v_writelane_b32 v4, s66, 16 +; VI-NEXT: v_writelane_b32 v4, s67, 17 +; VI-NEXT: v_writelane_b32 v4, s68, 18 +; VI-NEXT: v_writelane_b32 v4, s69, 19 +; VI-NEXT: v_writelane_b32 v4, s70, 20 +; VI-NEXT: v_writelane_b32 v4, s71, 21 +; VI-NEXT: v_writelane_b32 v4, s80, 22 +; VI-NEXT: v_writelane_b32 v4, s81, 23 +; VI-NEXT: v_writelane_b32 v4, s82, 24 +; VI-NEXT: v_writelane_b32 v4, s83, 25 +; VI-NEXT: v_writelane_b32 v4, s30, 26 +; VI-NEXT: v_writelane_b32 v4, s31, 27 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; VI-NEXT: v_readfirstlane_b32 s4, v1 ; VI-NEXT: s_and_b64 s[6:7], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s5, v2 -; VI-NEXT: s_cbranch_scc0 .LBB109_3 +; VI-NEXT: s_cbranch_scc0 .LBB109_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s56, s5, 24 -; VI-NEXT: s_lshr_b32 s57, s5, 16 -; VI-NEXT: s_lshr_b32 s59, s5, 8 -; VI-NEXT: s_lshr_b32 s58, s4, 16 -; VI-NEXT: s_lshr_b32 s60, s4, 8 -; VI-NEXT: s_lshr_b32 s61, s29, 24 -; VI-NEXT: s_lshr_b32 s62, s29, 16 -; VI-NEXT: s_lshr_b32 s72, s29, 8 -; VI-NEXT: s_lshr_b32 s63, s28, 16 -; VI-NEXT: s_lshr_b32 s73, s28, 8 -; VI-NEXT: s_lshr_b32 s74, s27, 24 -; VI-NEXT: s_lshr_b32 s75, s27, 16 -; VI-NEXT: s_lshr_b32 s77, s27, 8 -; VI-NEXT: s_lshr_b32 s76, s26, 16 -; VI-NEXT: s_lshr_b32 s78, s26, 8 -; VI-NEXT: s_lshr_b32 s79, s25, 24 -; VI-NEXT: s_lshr_b32 s88, s25, 16 -; VI-NEXT: s_lshr_b32 s90, s25, 8 -; VI-NEXT: s_lshr_b32 s89, s24, 16 -; VI-NEXT: s_lshr_b32 s91, s24, 8 -; VI-NEXT: s_lshr_b32 s30, s23, 24 -; VI-NEXT: s_lshr_b32 s31, s23, 16 -; VI-NEXT: s_lshr_b32 s35, s23, 8 -; VI-NEXT: s_lshr_b32 s34, s22, 16 -; VI-NEXT: s_lshr_b32 s36, s22, 8 -; VI-NEXT: s_lshr_b32 s37, s21, 24 -; VI-NEXT: s_lshr_b32 s38, s21, 16 -; VI-NEXT: s_lshr_b32 s48, s21, 8 -; VI-NEXT: s_lshr_b32 s39, s20, 16 -; VI-NEXT: s_lshr_b32 s49, s20, 8 -; VI-NEXT: s_lshr_b32 s50, s19, 24 -; VI-NEXT: s_lshr_b32 s51, s19, 16 -; VI-NEXT: s_lshr_b32 s53, s19, 8 -; VI-NEXT: s_lshr_b32 s52, s18, 16 -; VI-NEXT: s_lshr_b32 s54, s18, 8 -; VI-NEXT: s_lshr_b32 s55, s17, 24 -; VI-NEXT: s_lshr_b32 s64, s17, 16 -; VI-NEXT: s_lshr_b32 s66, s17, 8 -; VI-NEXT: s_lshr_b32 s65, s16, 16 -; VI-NEXT: s_lshr_b32 s67, s16, 8 -; VI-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 -; VI-NEXT: 
s_lshr_b64 s[10:11], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 -; VI-NEXT: s_cbranch_execnz .LBB109_4 +; VI-NEXT: s_lshr_b32 s7, s5, 24 +; VI-NEXT: s_lshr_b32 s9, s5, 16 +; VI-NEXT: s_lshr_b32 s11, s5, 8 +; VI-NEXT: s_lshr_b32 s13, s4, 16 +; VI-NEXT: s_lshr_b32 s15, s4, 8 +; VI-NEXT: s_lshr_b32 s41, s29, 24 +; VI-NEXT: s_lshr_b32 s47, s29, 16 +; VI-NEXT: s_lshr_b32 s57, s29, 8 +; VI-NEXT: s_lshr_b32 s88, s28, 16 +; VI-NEXT: s_lshr_b32 s89, s28, 8 +; VI-NEXT: s_lshr_b32 s90, s27, 24 +; VI-NEXT: s_lshr_b32 s91, s27, 16 +; VI-NEXT: s_lshr_b32 s30, s27, 8 +; VI-NEXT: s_lshr_b32 s31, s26, 16 +; VI-NEXT: s_lshr_b32 s34, s26, 8 +; VI-NEXT: s_lshr_b32 s35, s25, 24 +; VI-NEXT: s_lshr_b32 s36, s25, 16 +; VI-NEXT: s_lshr_b32 s37, s25, 8 +; VI-NEXT: s_lshr_b32 s38, s24, 16 +; VI-NEXT: s_lshr_b32 s39, s24, 8 +; VI-NEXT: s_lshr_b32 s48, s23, 24 +; VI-NEXT: s_lshr_b32 s49, s23, 16 +; VI-NEXT: s_lshr_b32 s50, s23, 8 +; VI-NEXT: s_lshr_b32 s51, s22, 16 +; VI-NEXT: s_lshr_b32 s52, s22, 8 +; VI-NEXT: s_lshr_b32 s53, s21, 24 +; VI-NEXT: s_lshr_b32 s54, s21, 16 +; VI-NEXT: s_lshr_b32 s55, s21, 8 +; VI-NEXT: s_lshr_b32 s64, s20, 16 +; VI-NEXT: s_lshr_b32 s65, s20, 8 +; VI-NEXT: s_lshr_b32 s66, s19, 24 +; VI-NEXT: s_lshr_b32 s67, s19, 16 +; VI-NEXT: s_lshr_b32 s68, s19, 8 +; VI-NEXT: s_lshr_b32 s69, s18, 16 +; VI-NEXT: s_lshr_b32 s70, s18, 8 +; VI-NEXT: s_lshr_b32 s71, s17, 24 +; VI-NEXT: s_lshr_b32 s80, s17, 16 +; VI-NEXT: s_lshr_b32 s81, s17, 8 +; VI-NEXT: s_lshr_b32 s82, s16, 16 +; VI-NEXT: s_lshr_b32 s83, s16, 8 +; VI-NEXT: s_lshr_b64 s[42:43], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[44:45], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[58:59], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[60:61], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[76:77], s[16:17], 24 +; VI-NEXT: s_mov_b32 s6, s17 +; VI-NEXT: s_mov_b32 s8, s19 +; VI-NEXT: s_mov_b32 s10, s21 +; VI-NEXT: s_mov_b32 s12, s23 +; VI-NEXT: s_mov_b32 s14, s25 +; VI-NEXT: s_mov_b32 s40, s27 +; VI-NEXT: s_mov_b32 s46, s29 +; VI-NEXT: s_mov_b32 s56, s5 +; VI-NEXT: s_cbranch_execnz .LBB109_3 ; VI-NEXT: .LBB109_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s6, s17, 16 -; VI-NEXT: v_mov_b32_e32 v15, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s6, v15 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s6, v15 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_add_i32 s8, s7, 0x7fff +; VI-NEXT: s_or_b32 s9, s6, 0x400000 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s6, s16, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s6, v15 -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; 
VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s6, v15 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: s_lshl_b32 s6, s19, 16 -; VI-NEXT: v_alignbit_b32 v1, v3, v1, 16 -; VI-NEXT: v_add_f32_e32 v3, s6, v15 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s6, v15 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: s_lshl_b32 s6, s18, 16 -; VI-NEXT: v_alignbit_b32 v4, v4, v3, 16 -; VI-NEXT: v_add_f32_e32 v3, s6, v15 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s6, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; VI-NEXT: v_add_f32_e32 v5, s6, v15 -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: s_lshl_b32 s6, s21, 16 -; VI-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; VI-NEXT: v_add_f32_e32 v5, s6, v15 -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_add_f32_e32 v6, s6, v15 -; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v8, 0x400000, v6 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: s_lshl_b32 s6, s20, 16 -; VI-NEXT: v_alignbit_b32 v6, v6, v5, 16 -; VI-NEXT: v_add_f32_e32 v5, s6, v15 -; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: s_and_b32 s6, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc -; VI-NEXT: v_add_f32_e32 v7, s6, v15 -; VI-NEXT: v_bfe_u32 v8, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: s_lshl_b32 s6, s23, 16 -; VI-NEXT: v_alignbit_b32 v5, v7, v5, 16 -; VI-NEXT: v_add_f32_e32 v7, s6, v15 -; VI-NEXT: v_bfe_u32 
v8, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc -; VI-NEXT: v_add_f32_e32 v8, s6, v15 -; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: s_lshl_b32 s6, s22, 16 -; VI-NEXT: v_alignbit_b32 v8, v8, v7, 16 -; VI-NEXT: v_add_f32_e32 v7, s6, v15 -; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: s_and_b32 s6, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc -; VI-NEXT: v_add_f32_e32 v9, s6, v15 -; VI-NEXT: v_bfe_u32 v10, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: s_lshl_b32 s6, s25, 16 -; VI-NEXT: v_alignbit_b32 v7, v9, v7, 16 -; VI-NEXT: v_add_f32_e32 v9, s6, v15 -; VI-NEXT: v_bfe_u32 v10, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc -; VI-NEXT: v_add_f32_e32 v10, s6, v15 -; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: v_or_b32_e32 v12, 0x400000, v10 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; VI-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: s_lshl_b32 s6, s24, 16 -; VI-NEXT: v_alignbit_b32 v10, v10, v9, 16 -; VI-NEXT: v_add_f32_e32 v9, s6, v15 -; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: v_or_b32_e32 v12, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: s_and_b32 s6, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc -; VI-NEXT: v_add_f32_e32 v11, s6, v15 -; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: s_lshl_b32 s6, s27, 16 -; VI-NEXT: v_alignbit_b32 v9, v11, v9, 16 -; VI-NEXT: v_add_f32_e32 v11, s6, v15 -; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc -; VI-NEXT: v_add_f32_e32 v12, s6, v15 -; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 -; VI-NEXT: v_cmp_u_f32_e32 
vcc, v12, v12 -; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: s_lshl_b32 s6, s26, 16 -; VI-NEXT: v_alignbit_b32 v12, v12, v11, 16 -; VI-NEXT: v_add_f32_e32 v11, s6, v15 -; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: s_and_b32 s6, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v11, v13, v14, vcc -; VI-NEXT: v_add_f32_e32 v13, s6, v15 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: s_lshl_b32 s6, s29, 16 -; VI-NEXT: v_alignbit_b32 v11, v13, v11, 16 -; VI-NEXT: v_add_f32_e32 v13, s6, v15 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_and_b32 s6, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v16, vcc -; VI-NEXT: v_add_f32_e32 v14, s6, v15 -; VI-NEXT: v_bfe_u32 v16, v14, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v14 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v14 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; VI-NEXT: v_cndmask_b32_e32 v14, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: s_lshl_b32 s6, s28, 16 -; VI-NEXT: v_alignbit_b32 v14, v14, v13, 16 -; VI-NEXT: v_add_f32_e32 v13, s6, v15 -; VI-NEXT: v_bfe_u32 v16, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v13 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v13, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s6, v15 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: s_lshl_b32 s6, s5, 16 -; VI-NEXT: v_alignbit_b32 v13, v16, v13, 16 -; VI-NEXT: v_add_f32_e32 v16, s6, v15 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s6, s9, s8 +; VI-NEXT: s_and_b32 s7, s17, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s8, s7, 0x10010 +; VI-NEXT: s_add_i32 s8, s8, s7 +; VI-NEXT: s_add_i32 s10, s8, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[8:9], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s10 +; VI-NEXT: s_lshr_b32 s7, s7, 16 +; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16 +; VI-NEXT: s_lshl_b32 s7, s16, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s8, s7, 0x10010 +; VI-NEXT: s_add_i32 s8, s8, s7 +; VI-NEXT: s_add_i32 s10, s8, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; 
VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[8:9], vcc, exec +; VI-NEXT: s_cselect_b32 s8, s7, s10 +; VI-NEXT: s_and_b32 s7, s16, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[10:11], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s9, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s19, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_lshr_b64 s[16:17], s[8:9], 16 +; VI-NEXT: s_bfe_u32 s8, s7, 0x10010 +; VI-NEXT: s_add_i32 s8, s8, s7 +; VI-NEXT: s_add_i32 s10, s8, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[8:9], vcc, exec +; VI-NEXT: s_cselect_b32 s8, s7, s10 +; VI-NEXT: s_and_b32 s7, s19, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[10:11], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s9, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s18, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], 16 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[10:11], vcc, exec +; VI-NEXT: s_cselect_b32 s10, s7, s9 +; VI-NEXT: s_and_b32 s7, s18, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[12:13], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s11, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s21, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[18:19], s[10:11], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[10:11], vcc, exec +; VI-NEXT: s_cselect_b32 s10, s7, s9 +; VI-NEXT: s_and_b32 s7, s21, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[12:13], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s11, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s20, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[10:11], s[10:11], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[12:13], vcc, exec +; VI-NEXT: s_cselect_b32 s12, s7, s9 +; VI-NEXT: s_and_b32 s7, s20, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, 
s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s13, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s23, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[20:21], s[12:13], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[12:13], vcc, exec +; VI-NEXT: s_cselect_b32 s12, s7, s9 +; VI-NEXT: s_and_b32 s7, s23, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s13, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s22, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[12:13], s[12:13], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 +; VI-NEXT: s_and_b32 s7, s22, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[22:23], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s15, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s25, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[22:23], s[14:15], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s7, s9 +; VI-NEXT: s_and_b32 s7, s25, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[40:41], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s15, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s24, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[14:15], s[14:15], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[40:41], vcc, exec +; VI-NEXT: s_cselect_b32 s40, s7, s9 +; VI-NEXT: s_and_b32 s7, s24, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[24:25], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 
s41, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s27, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[24:25], s[40:41], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[40:41], vcc, exec +; VI-NEXT: s_cselect_b32 s40, s7, s9 +; VI-NEXT: s_and_b32 s7, s27, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s41, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s26, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[40:41], s[40:41], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s42, s7, s9 +; VI-NEXT: s_and_b32 s7, s26, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[26:27], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s43, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s29, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[26:27], s[42:43], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s42, s7, s9 +; VI-NEXT: s_and_b32 s7, s29, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[44:45], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s43, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s28, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[46:47], s[42:43], 16 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s42, s7, s9 +; VI-NEXT: s_and_b32 s7, s28, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[28:29], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s9 +; VI-NEXT: s_lshr_b32 s43, s7, 16 +; VI-NEXT: s_lshl_b32 s7, s5, 16 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s9, s7, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s7 +; VI-NEXT: s_lshr_b64 s[28:29], s[42:43], 16 +; VI-NEXT: 
s_addk_i32 s9, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s42, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_add_f32_e32 v17, s5, v15 -; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_add_f32_e32 v2, s5, v1 +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: s_bfe_u32 s7, s5, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s5 +; VI-NEXT: s_addk_i32 s7, 0x7fff +; VI-NEXT: s_bitset1_b32 s5, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[44:45], vcc, exec +; VI-NEXT: s_cselect_b32 s5, s5, s7 +; VI-NEXT: s_lshr_b32 s43, s5, 16 ; VI-NEXT: s_lshl_b32 s5, s4, 16 -; VI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; VI-NEXT: v_add_f32_e32 v17, s5, v15 -; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_add_f32_e32 v2, s5, v1 +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: s_bfe_u32 s7, s5, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s5 +; VI-NEXT: s_lshr_b64 s[56:57], s[42:43], 16 +; VI-NEXT: s_addk_i32 s7, 0x7fff +; VI-NEXT: s_bitset1_b32 s5, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s42, s5, s7 ; VI-NEXT: s_and_b32 s4, s4, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_add_f32_e32 v15, s4, v15 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_cndmask_b32_e32 v15, v18, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v17, 16 -; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] -; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] -; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[13:14] -; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v26, 24, v16 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v14 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v13 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v12 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v12 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v11 -; VI-NEXT: 
v_lshrrev_b32_e32 v49, 24, v10 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v8 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v1 -; VI-NEXT: s_branch .LBB109_5 -; VI-NEXT: .LBB109_3: -; VI-NEXT: ; implicit-def: $sgpr67 -; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 +; VI-NEXT: v_readfirstlane_b32 s4, v1 +; VI-NEXT: s_bfe_u32 s5, s4, 0x10010 +; VI-NEXT: s_add_i32 s5, s5, s4 +; VI-NEXT: s_add_i32 s7, s5, 0x7fff +; VI-NEXT: s_or_b32 s9, s4, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cselect_b32 s4, s9, s7 +; VI-NEXT: s_lshr_b32 s43, s4, 16 +; VI-NEXT: s_lshr_b64 s[4:5], s[42:43], 16 +; VI-NEXT: s_mov_b32 s17, s6 +; VI-NEXT: s_mov_b32 s19, s8 +; VI-NEXT: s_mov_b32 s21, s10 +; VI-NEXT: s_mov_b32 s23, s12 +; VI-NEXT: s_mov_b32 s25, s14 +; VI-NEXT: s_mov_b32 s27, s40 +; VI-NEXT: s_mov_b32 s29, s46 +; VI-NEXT: s_mov_b32 s5, s56 +; VI-NEXT: s_lshr_b64 s[42:43], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[44:45], s[28:29], 24 +; VI-NEXT: s_lshr_b32 s7, s56, 24 +; VI-NEXT: s_lshr_b32 s9, s56, 16 +; VI-NEXT: s_lshr_b32 s11, s56, 8 +; VI-NEXT: s_lshr_b32 s13, s4, 16 +; VI-NEXT: s_lshr_b32 s15, s4, 8 +; VI-NEXT: s_lshr_b32 s41, s46, 24 +; VI-NEXT: s_lshr_b32 s47, s46, 16 +; VI-NEXT: s_lshr_b32 s57, s46, 8 +; VI-NEXT: s_lshr_b32 s88, s28, 16 +; VI-NEXT: s_lshr_b32 s89, s28, 8 +; VI-NEXT: s_lshr_b32 s90, s40, 24 +; VI-NEXT: s_lshr_b32 s91, s40, 16 +; VI-NEXT: s_lshr_b32 s30, s40, 8 +; VI-NEXT: s_lshr_b32 s31, s26, 16 +; VI-NEXT: s_lshr_b32 s34, s26, 8 +; VI-NEXT: s_lshr_b32 s35, s14, 24 +; VI-NEXT: s_lshr_b32 s36, s14, 16 +; VI-NEXT: s_lshr_b32 s37, s14, 8 +; VI-NEXT: s_lshr_b32 s38, s24, 16 +; VI-NEXT: s_lshr_b32 s39, s24, 8 +; VI-NEXT: s_lshr_b32 s48, s12, 24 +; VI-NEXT: s_lshr_b32 s49, s12, 16 +; VI-NEXT: s_lshr_b32 s50, s12, 8 +; VI-NEXT: s_lshr_b32 s51, s22, 16 +; VI-NEXT: s_lshr_b32 s52, s22, 8 +; VI-NEXT: s_lshr_b32 s53, s10, 24 +; VI-NEXT: s_lshr_b32 s54, s10, 16 +; VI-NEXT: s_lshr_b32 s55, s10, 8 +; VI-NEXT: s_lshr_b32 s64, s20, 16 +; VI-NEXT: s_lshr_b32 s65, s20, 8 +; VI-NEXT: s_lshr_b32 s66, s8, 24 +; VI-NEXT: s_lshr_b32 s67, s8, 16 +; VI-NEXT: s_lshr_b32 s68, s8, 8 +; VI-NEXT: s_lshr_b32 s69, s18, 16 +; VI-NEXT: s_lshr_b32 s70, s18, 8 +; VI-NEXT: s_lshr_b32 s71, s6, 24 +; VI-NEXT: s_lshr_b32 s80, s6, 16 +; VI-NEXT: s_lshr_b32 s81, s6, 8 +; VI-NEXT: s_lshr_b32 s82, s16, 16 +; VI-NEXT: s_lshr_b32 s83, s16, 8 +; VI-NEXT: s_lshr_b64 s[58:59], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[60:61], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[18:19], 24 
+; VI-NEXT: s_lshr_b64 s[76:77], s[16:17], 24 +; VI-NEXT: .LBB109_3: ; %end +; VI-NEXT: s_and_b32 s5, s16, 0xff +; VI-NEXT: s_lshl_b32 s16, s83, 8 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_lshl_b32 s16, s76, 8 +; VI-NEXT: s_and_b32 s17, s82, 0xff +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s16, s16, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_and_b32 s5, s6, 0xff +; VI-NEXT: s_lshl_b32 s6, s81, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s80, 0xff +; VI-NEXT: s_lshl_b32 s16, s71, 8 +; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s70, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s69, 0xff +; VI-NEXT: s_lshl_b32 s16, s74, 8 +; VI-NEXT: s_or_b32 s6, s6, s16 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s8, 0xff +; VI-NEXT: s_lshl_b32 s6, s68, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s67, 0xff +; VI-NEXT: s_lshl_b32 s8, s66, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s65, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s64, 0xff +; VI-NEXT: s_lshl_b32 s8, s72, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s10, 0xff +; VI-NEXT: s_lshl_b32 s6, s55, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s54, 0xff +; VI-NEXT: s_lshl_b32 s8, s53, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s22, 0xff +; VI-NEXT: s_lshl_b32 s6, s52, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s51, 0xff +; VI-NEXT: s_lshl_b32 s8, s62, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s12, 0xff +; VI-NEXT: s_lshl_b32 s6, s50, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s49, 0xff +; VI-NEXT: s_lshl_b32 s8, s48, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s24, 0xff +; VI-NEXT: s_lshl_b32 s6, s39, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; 
VI-NEXT: s_and_b32 s6, s38, 0xff +; VI-NEXT: s_lshl_b32 s8, s60, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s14, 0xff +; VI-NEXT: s_lshl_b32 s6, s37, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s36, 0xff +; VI-NEXT: s_lshl_b32 s8, s35, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s6, s34, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s31, 0xff +; VI-NEXT: s_lshl_b32 s8, s58, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s40, 0xff +; VI-NEXT: s_lshl_b32 s6, s30, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s91, 0xff +; VI-NEXT: s_lshl_b32 s8, s90, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s28, 0xff +; VI-NEXT: s_lshl_b32 s6, s89, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s88, 0xff +; VI-NEXT: s_lshl_b32 s8, s44, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s46, 0xff +; VI-NEXT: s_lshl_b32 s6, s57, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s47, 0xff +; VI-NEXT: s_lshl_b32 s8, s41, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xff +; VI-NEXT: s_lshl_b32 s5, s15, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s13, 0xff +; VI-NEXT: s_lshl_b32 s6, s42, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_and_b32 s4, s56, 0xff +; VI-NEXT: s_lshl_b32 s5, s11, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s9, 0xff +; VI-NEXT: s_lshl_b32 s6, s7, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_readlane_b32 s30, v4, 26 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_readlane_b32 
s31, v4, 27 +; VI-NEXT: v_readlane_b32 s83, v4, 25 +; VI-NEXT: v_readlane_b32 s82, v4, 24 +; VI-NEXT: v_readlane_b32 s81, v4, 23 +; VI-NEXT: v_readlane_b32 s80, v4, 22 +; VI-NEXT: v_readlane_b32 s71, v4, 21 +; VI-NEXT: v_readlane_b32 s70, v4, 20 +; VI-NEXT: v_readlane_b32 s69, v4, 19 +; VI-NEXT: v_readlane_b32 s68, v4, 18 +; VI-NEXT: v_readlane_b32 s67, v4, 17 +; VI-NEXT: v_readlane_b32 s66, v4, 16 +; VI-NEXT: v_readlane_b32 s65, v4, 15 +; VI-NEXT: v_readlane_b32 s64, v4, 14 +; VI-NEXT: v_readlane_b32 s55, v4, 13 +; VI-NEXT: v_readlane_b32 s54, v4, 12 +; VI-NEXT: v_readlane_b32 s53, v4, 11 +; VI-NEXT: v_readlane_b32 s52, v4, 10 +; VI-NEXT: v_readlane_b32 s51, v4, 9 +; VI-NEXT: v_readlane_b32 s50, v4, 8 +; VI-NEXT: v_readlane_b32 s49, v4, 7 +; VI-NEXT: v_readlane_b32 s48, v4, 6 +; VI-NEXT: v_readlane_b32 s39, v4, 5 +; VI-NEXT: v_readlane_b32 s38, v4, 4 +; VI-NEXT: v_readlane_b32 s37, v4, 3 +; VI-NEXT: v_readlane_b32 s36, v4, 2 +; VI-NEXT: v_readlane_b32 s35, v4, 1 +; VI-NEXT: v_readlane_b32 s34, v4, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB109_4: +; VI-NEXT: ; implicit-def: $sgpr83 +; VI-NEXT: ; implicit-def: $sgpr82 +; VI-NEXT: ; implicit-def: $sgpr76 ; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr81 +; VI-NEXT: ; implicit-def: $sgpr80 +; VI-NEXT: ; implicit-def: $sgpr71 +; VI-NEXT: ; implicit-def: $sgpr70 +; VI-NEXT: ; implicit-def: $sgpr69 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr68 +; VI-NEXT: ; implicit-def: $sgpr67 ; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr65 ; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr10 ; VI-NEXT: ; implicit-def: $sgpr55 ; VI-NEXT: ; implicit-def: $sgpr54 -; VI-NEXT: ; implicit-def: $sgpr52 -; VI-NEXT: ; implicit-def: $sgpr8 ; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr52 ; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr12 ; VI-NEXT: ; implicit-def: $sgpr50 ; VI-NEXT: ; implicit-def: $sgpr49 -; VI-NEXT: ; implicit-def: $sgpr39 -; VI-NEXT: ; implicit-def: $sgpr10 ; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr39 ; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr14 ; VI-NEXT: ; implicit-def: $sgpr37 ; VI-NEXT: ; implicit-def: $sgpr36 -; VI-NEXT: ; implicit-def: $sgpr34 -; VI-NEXT: ; implicit-def: $sgpr12 ; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr34 ; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr40 ; VI-NEXT: ; implicit-def: $sgpr30 ; VI-NEXT: ; implicit-def: $sgpr91 -; VI-NEXT: ; implicit-def: $sgpr89 -; VI-NEXT: ; implicit-def: $sgpr14 ; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr89 ; VI-NEXT: ; implicit-def: $sgpr88 -; VI-NEXT: ; implicit-def: $sgpr79 -; VI-NEXT: ; implicit-def: $sgpr78 -; VI-NEXT: ; implicit-def: $sgpr76 -; VI-NEXT: ; implicit-def: $sgpr40 -; VI-NEXT: ; implicit-def: $sgpr77 -; VI-NEXT: ; implicit-def: $sgpr75 -; VI-NEXT: ; implicit-def: $sgpr74 -; VI-NEXT: ; implicit-def: $sgpr73 -; VI-NEXT: ; implicit-def: $sgpr63 -; VI-NEXT: ; implicit-def: $sgpr42 -; VI-NEXT: ; implicit-def: $sgpr72 -; VI-NEXT: ; implicit-def: $sgpr62 -; VI-NEXT: ; 
implicit-def: $sgpr61 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr58 ; VI-NEXT: ; implicit-def: $sgpr44 -; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr42 ; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr9 +; VI-NEXT: ; implicit-def: $sgpr7 ; VI-NEXT: s_branch .LBB109_2 -; VI-NEXT: .LBB109_4: -; VI-NEXT: v_mov_b32_e32 v19, s44 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v19, s42 -; VI-NEXT: v_mov_b32_e32 v1, s16 -; VI-NEXT: v_mov_b32_e32 v2, s17 -; VI-NEXT: v_mov_b32_e32 v3, s18 -; VI-NEXT: v_mov_b32_e32 v4, s19 -; VI-NEXT: v_mov_b32_e32 v5, s20 -; VI-NEXT: v_mov_b32_e32 v6, s21 -; VI-NEXT: v_mov_b32_e32 v7, s22 -; VI-NEXT: v_mov_b32_e32 v8, s23 -; VI-NEXT: v_mov_b32_e32 v9, s24 -; VI-NEXT: v_mov_b32_e32 v10, s25 -; VI-NEXT: v_mov_b32_e32 v11, s26 -; VI-NEXT: v_mov_b32_e32 v12, s27 -; VI-NEXT: v_mov_b32_e32 v13, s28 -; VI-NEXT: v_mov_b32_e32 v14, s29 -; VI-NEXT: v_mov_b32_e32 v15, s4 -; VI-NEXT: v_mov_b32_e32 v16, s5 -; VI-NEXT: v_mov_b32_e32 v18, s67 -; VI-NEXT: v_mov_b32_e32 v62, s65 -; VI-NEXT: v_mov_b32_e32 v17, s66 -; VI-NEXT: v_mov_b32_e32 v60, s64 -; VI-NEXT: v_mov_b32_e32 v61, s55 -; VI-NEXT: v_mov_b32_e32 v58, s54 -; VI-NEXT: v_mov_b32_e32 v59, s52 -; VI-NEXT: v_mov_b32_e32 v57, s53 -; VI-NEXT: v_mov_b32_e32 v47, s51 -; VI-NEXT: v_mov_b32_e32 v56, s50 -; VI-NEXT: v_mov_b32_e32 v46, s49 -; VI-NEXT: v_mov_b32_e32 v45, s39 -; VI-NEXT: v_mov_b32_e32 v44, s48 -; VI-NEXT: v_mov_b32_e32 v42, s38 -; VI-NEXT: v_mov_b32_e32 v43, s37 -; VI-NEXT: v_mov_b32_e32 v41, s36 -; VI-NEXT: v_mov_b32_e32 v40, s34 -; VI-NEXT: v_mov_b32_e32 v55, s35 -; VI-NEXT: v_mov_b32_e32 v53, s31 -; VI-NEXT: v_mov_b32_e32 v54, s30 -; VI-NEXT: v_mov_b32_e32 v52, s91 -; VI-NEXT: v_mov_b32_e32 v51, s89 -; VI-NEXT: v_mov_b32_e32 v50, s90 -; VI-NEXT: v_mov_b32_e32 v48, s88 -; VI-NEXT: v_mov_b32_e32 v49, s79 -; VI-NEXT: v_mov_b32_e32 v39, s78 -; VI-NEXT: v_mov_b32_e32 v38, s76 -; VI-NEXT: v_mov_b32_e32 v37, s77 -; VI-NEXT: v_mov_b32_e32 v35, s75 -; VI-NEXT: v_mov_b32_e32 v36, s74 -; VI-NEXT: v_mov_b32_e32 v34, s73 -; VI-NEXT: v_mov_b32_e32 v33, s63 -; VI-NEXT: v_mov_b32_e32 v32, s72 -; VI-NEXT: v_mov_b32_e32 v30, s62 -; VI-NEXT: v_mov_b32_e32 v31, s61 -; VI-NEXT: v_mov_b32_e32 v29, s60 -; VI-NEXT: v_mov_b32_e32 v28, s58 -; VI-NEXT: v_mov_b32_e32 v27, s59 -; VI-NEXT: v_mov_b32_e32 v25, s57 -; VI-NEXT: v_mov_b32_e32 v26, s56 -; VI-NEXT: v_mov_b32_e32 v21, s12 -; VI-NEXT: v_mov_b32_e32 v22, s10 -; VI-NEXT: v_mov_b32_e32 v23, s8 -; VI-NEXT: v_mov_b32_e32 v24, s6 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v19, s40 -; VI-NEXT: v_mov_b32_e32 v20, s14 -; VI-NEXT: .LBB109_5: ; %end -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v24 -; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: 
v_or_b32_sdwa v17, v62, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v61
-; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v23
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v58
-; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v57
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v56
-; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v46
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v22
-; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v44
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v43
-; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v41
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v21
-; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v55
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v54
-; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v52
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v20
-; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v50
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v49
-; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v39
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v19
-; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v37
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v36
-; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v34
-; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_readlane_b32 s30, v63, 18
-; VI-NEXT: v_readlane_b32 s31, v63, 19
-; VI-NEXT: v_readlane_b32 s67, v63, 17
-; VI-NEXT: v_readlane_b32 s66, v63, 16
-; VI-NEXT: v_readlane_b32 s65, v63, 15
-; VI-NEXT: v_readlane_b32 s64, v63, 14
-; VI-NEXT: v_readlane_b32 s55, v63, 13
-; VI-NEXT: v_readlane_b32 s54, v63, 12
-; VI-NEXT: v_readlane_b32 s53, v63, 11
-; VI-NEXT: v_readlane_b32 s52, v63, 10
-; VI-NEXT: v_readlane_b32 s51, v63, 9
-; VI-NEXT: v_readlane_b32 s50, v63, 8
-; VI-NEXT: v_readlane_b32 s49, v63, 7
-; VI-NEXT: v_readlane_b32 s48, v63, 6
-; VI-NEXT: v_readlane_b32 s39, v63, 5
-; VI-NEXT: v_readlane_b32 s38, v63, 4
-; VI-NEXT: v_readlane_b32 s37, v63, 3
-; VI-NEXT: v_readlane_b32 s36, v63, 2
-; VI-NEXT: v_readlane_b32 s35, v63, 1
-; VI-NEXT: v_readlane_b32 s34, v63, 0
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v32
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v31
-; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v29
-; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v27
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v26
-; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0
-; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; VI-NEXT: s_or_saveexec_b64 s[4:5], -1
-; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: bitcast_v32bf16_to_v64i8_scalar:
 ; GFX9: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
index 14e17ce49cca0..3d9c7681b3132 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
@@ -2272,30 +2272,30 @@ define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_cmp_lg_u32 s20, 0
-; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v8, 1.0, s17
 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v7, 1.0, s19
 ; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18
 ; SI-NEXT: s_cbranch_scc0 .LBB23_4
 ; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8
+; SI-NEXT: v_lshr_b64 v[0:1], v[5:6], 16
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v7
+; SI-NEXT: v_lshr_b64 v[1:2], v[3:4], 16
 ; SI-NEXT: s_cbranch_execnz .LBB23_3
 ; SI-NEXT: .LBB23_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8
 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: .LBB23_3: ; %end
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ; SI-NEXT: .LBB23_4:
@@ -2311,42 +2311,43 @@ define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg
 ; VI-NEXT: s_cbranch_execnz .LBB23_4
 ; VI-NEXT: .LBB23_2: ; %cmp.true
 ; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_mov_b32_e32 v4, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v4
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v4
 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
 ; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v4
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_bfe_u32 v3, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v4
+; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, v2
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ; VI-NEXT: .LBB23_3:
 ; VI-NEXT: s_branch .LBB23_2
@@ -5460,30 +5461,30 @@ define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inr
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_cmp_lg_u32 s20, 0
-; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v8, 1.0, s17
 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v7, 1.0, s19
 ; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18
 ; SI-NEXT: s_cbranch_scc0 .LBB47_4
 ; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8
+; SI-NEXT: v_lshr_b64 v[0:1], v[5:6], 16
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v7
+; SI-NEXT: v_lshr_b64 v[1:2], v[3:4], 16
 ; SI-NEXT: s_cbranch_execnz .LBB47_3
 ; SI-NEXT: .LBB47_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8
 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: .LBB47_3: ; %end
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ; SI-NEXT: .LBB47_4:
@@ -5499,42 +5500,43 @@ define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inr
 ; VI-NEXT: s_cbranch_execnz .LBB47_4
 ; VI-NEXT: .LBB47_2: ; %cmp.true
 ; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_mov_b32_e32 v4, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v4
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v4
 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
 ; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v4
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_bfe_u32 v3, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v4
+; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, v2
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ; VI-NEXT: .LBB47_3:
 ; VI-NEXT: s_branch .LBB47_2
@@ -8361,30 +8363,30 @@ define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i3
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_cmp_lg_u32 s20, 0
-; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v8, 1.0, s17
 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v7, 1.0, s19
 ; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18
 ; SI-NEXT: s_cbranch_scc0 .LBB67_4
 ; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8
+; SI-NEXT: v_lshr_b64 v[0:1], v[5:6], 16
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v7
+; SI-NEXT: v_lshr_b64 v[1:2], v[3:4], 16
 ; SI-NEXT: s_cbranch_execnz .LBB67_3
 ; SI-NEXT: .LBB67_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8
 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: .LBB67_3: ; %end
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ; SI-NEXT: .LBB67_4:
@@ -8400,42 +8402,43 @@ define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i3
 ; VI-NEXT: s_cbranch_execnz .LBB67_4
 ; VI-NEXT: .LBB67_2: ; %cmp.true
 ; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_mov_b32_e32 v4, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v4
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v4
 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
 ; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v4
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_bfe_u32 v3, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v4
+; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, v2
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ; VI-NEXT: .LBB67_3:
 ; VI-NEXT: s_branch .LBB67_2
@@ -10937,30 +10940,30 @@ define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a,
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_cmp_lg_u32 s20, 0
-; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v8, 1.0, s17
 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v7, 1.0, s19
 ; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18
 ; SI-NEXT: s_cbranch_scc0 .LBB83_4
 ; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8
+; SI-NEXT: v_lshr_b64 v[0:1], v[5:6], 16
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v7
+; SI-NEXT: v_lshr_b64 v[1:2], v[3:4], 16
 ; SI-NEXT: s_cbranch_execnz .LBB83_3
 ; SI-NEXT: .LBB83_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8
 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
 ; SI-NEXT: .LBB83_3: ; %end
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ; SI-NEXT: .LBB83_4:
@@ -10976,42 +10979,43 @@ define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a,
 ; VI-NEXT: s_cbranch_execnz .LBB83_4
 ; VI-NEXT: .LBB83_2: ; %cmp.true
 ; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_mov_b32_e32 v4, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v4
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v4
 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
 ; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v4
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_bfe_u32 v3, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v4
+; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, v2
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ; VI-NEXT: .LBB83_3:
 ; VI-NEXT: s_branch .LBB83_2
@@ -13150,39 +13154,40 @@ define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i3
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_cmp_lg_u32 s20, 0
 ; SI-NEXT: v_mul_f32_e64 v7, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v6, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v6, 1.0, s19
 ; SI-NEXT: s_cbranch_scc0 .LBB95_4
 ; SI-NEXT: ; %bb.1: ; %cmp.false
 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
 ; SI-NEXT: s_cbranch_execnz .LBB95_3
 ; SI-NEXT: .LBB95_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v1
 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4
-; SI-NEXT: v_lshr_b64 v[4:5], v[1:2], 16
-; SI-NEXT: v_alignbit_b32 v0, v6, v0, 16
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5
+; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; SI-NEXT: v_lshr_b64 v[2:3], v[3:4], 16
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; SI-NEXT: v_lshr_b64 v[5:6], v[1:2], 16
 ; SI-NEXT: .LBB95_3: ; %end
-; SI-NEXT: v_mov_b32_e32 v1, v4
+; SI-NEXT: v_mov_b32_e32 v1, v5
+; SI-NEXT: v_mov_b32_e32 v3, v4
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ; SI-NEXT: .LBB95_4:
 ; SI-NEXT: ; implicit-def: $vgpr0
-; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr5
 ; SI-NEXT: ; implicit-def: $vgpr2
-; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr4
 ; SI-NEXT: s_branch .LBB95_2
 ;
 ; VI-LABEL: bitcast_v4bf16_to_v4i16_scalar:
@@ -13194,42 +13199,43 @@ define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i3
 ; VI-NEXT: s_cbranch_execnz .LBB95_4
 ; VI-NEXT: .LBB95_2: ; %cmp.true
 ; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_mov_b32_e32 v3, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v3
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v3
 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
 ; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v4, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
+; VI-NEXT: v_add_f32_e32 v2, s4, v3
+; VI-NEXT: v_bfe_u32 v4, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2
 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
-; VI-NEXT: v_bfe_u32 v4, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_add_f32_e32 v3, s4, v3
+; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v1, v0, v1, 16
-; VI-NEXT: v_alignbit_b32 v0, v3, v2, 16
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3]
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, v2
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ; VI-NEXT: .LBB95_3:
 ; VI-NEXT: s_branch .LBB95_2
@@ -15062,42 +15068,43 @@ define inreg <4 x half> @bitcast_v4bf16_to_v4f16_scalar(<4 x bfloat> inreg %a, i
 ; VI-NEXT: s_cbranch_execnz .LBB103_4
 ; VI-NEXT: .LBB103_2: ; %cmp.true
 ; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_mov_b32_e32 v3, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v3
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v3
 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
 ; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v4, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
+; VI-NEXT: v_add_f32_e32 v2, s4, v3
+; VI-NEXT: v_bfe_u32 v4, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2
 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
-; VI-NEXT: v_bfe_u32 v4, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_add_f32_e32 v3, s4, v3
+; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v1, v0, v1, 16
-; VI-NEXT: v_alignbit_b32 v0, v3, v2, 16
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3]
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, v2
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ; VI-NEXT: .LBB103_3:
 ; VI-NEXT: s_branch .LBB103_2
@@ -16737,52 +16744,52 @@ define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_cmp_lg_u32 s20, 0
-; SI-NEXT: v_mul_f32_e64 v11, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v12, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v0, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18
 ; SI-NEXT: s_cbranch_scc0 .LBB109_4
 ; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; SI-NEXT: v_alignbit_b32 v9, v1, v12, 16
-; SI-NEXT: v_alignbit_b32 v10, v6, v8, 16
-; SI-NEXT: v_lshr_b64 v[3:4], v[9:10], 24
-; SI-NEXT: v_lshr_b64 v[4:5], v[9:10], 16
-; SI-NEXT: v_lshr_b64 v[1:2], v[9:10], 8
-; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0
-; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v10
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v15
+; SI-NEXT: v_lshr_b64 v[11:12], v[0:1], 16
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14
+; SI-NEXT: v_lshr_b64 v[12:13], v[5:6], 16
+; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v14
+; SI-NEXT: v_lshr_b64 v[3:4], v[11:12], 24
+; SI-NEXT: v_lshr_b64 v[9:10], v[11:12], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[11:12], 8
+; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v12
 ; SI-NEXT: s_cbranch_execnz .LBB109_3
 ; SI-NEXT: .LBB109_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15
 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; SI-NEXT: v_alignbit_b32 v9, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8
-; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_lshr_b64 v[11:12], v[0:1], 16
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v14
+; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; SI-NEXT: v_alignbit_b32 v10, v6, v1, 16
-; SI-NEXT: v_lshr_b64 v[3:4], v[9:10], 24
-; SI-NEXT: v_lshr_b64 v[4:5], v[9:10], 16
-; SI-NEXT: v_lshr_b64 v[1:2], v[9:10], 8
-; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v10
+; SI-NEXT: v_lshr_b64 v[12:13], v[5:6], 16
 ; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0
+; SI-NEXT: v_lshr_b64 v[3:4], v[11:12], 24
+; SI-NEXT: v_lshr_b64 v[9:10], v[11:12], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[11:12], 8
+; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v12
 ; SI-NEXT: .LBB109_3: ; %end
-; SI-NEXT: v_mov_b32_e32 v0, v9
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v4, v10
+; SI-NEXT: v_mov_b32_e32 v0, v11
+; SI-NEXT: v_mov_b32_e32 v2, v9
+; SI-NEXT: v_mov_b32_e32 v4, v12
+; SI-NEXT: v_mov_b32_e32 v5, v8
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ; SI-NEXT: .LBB109_4:
-; SI-NEXT: ; implicit-def: $vgpr9
+; SI-NEXT: ; implicit-def: $vgpr11
 ; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr9
 ; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr6
+; SI-NEXT: ; implicit-def: $vgpr8
 ; SI-NEXT: ; implicit-def: $vgpr7
 ; SI-NEXT: s_branch .LBB109_2
 ;
@@ -16793,11 +16800,11 @@ define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32
 ; VI-NEXT: s_cbranch_scc0 .LBB109_3
 ; VI-NEXT: ; %bb.1: ; %cmp.false
 ; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24
-; VI-NEXT: s_lshr_b32 s8, s17, 24
-; VI-NEXT: s_lshr_b32 s5, s17, 16
-; VI-NEXT: s_lshr_b32 s10, s17, 8
-; VI-NEXT: s_lshr_b32 s9, s16, 16
-; VI-NEXT: s_lshr_b32 s11, s16, 8
+; VI-NEXT: s_lshr_b32 s5, s17, 24
+; VI-NEXT: s_lshr_b32 s11, s17, 16
+; VI-NEXT: s_lshr_b32 s8, s17, 8
+; VI-NEXT: s_lshr_b32 s10, s16, 16
+; VI-NEXT: s_lshr_b32 s9, s16, 8
 ; VI-NEXT: s_cbranch_execnz .LBB109_4
 ; VI-NEXT: .LBB109_2: ; %cmp.true
 ; VI-NEXT: s_lshl_b32 s4, s17, 16
@@ -16810,58 +16817,59 @@ define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32
 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
 ; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v5, v2, v3, vcc
 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
-; VI-NEXT: s_lshl_b32 s4, s16, 16
+; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
 ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v4, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1
+; VI-NEXT: v_lshrrev_b64 v[1:2], 16, v[5:6]
+; VI-NEXT: s_lshl_b32 s4, s16, 16
+; VI-NEXT: v_add_f32_e32 v2, s4, v0
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
 ; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
-; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
-; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
-; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; VI-NEXT: v_bfe_u32 v3, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v1, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v2, v6, v3, 16
-; VI-NEXT: v_alignbit_b32 v1, v0, v4, 16
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v3
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v4
-; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2]
-; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v2
-; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v2
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; VI-NEXT: v_lshrrev_b64 v[9:10], 16, v[2:3]
+; VI-NEXT: v_mov_b32_e32 v10, v1
+; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v5
+; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[9:10]
+; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v9
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v9
 ; VI-NEXT: v_mov_b32_e32 v4, v8
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ; VI-NEXT: .LBB109_3:
-; VI-NEXT: ; implicit-def: $sgpr11
 ; VI-NEXT: ; implicit-def: $sgpr9
-; VI-NEXT: ; implicit-def: $sgpr4
 ; VI-NEXT: ; implicit-def: $sgpr10
-; VI-NEXT: ; implicit-def: $sgpr5
+; VI-NEXT: ; implicit-def: $sgpr4
 ; VI-NEXT: ; implicit-def: $sgpr8
+; VI-NEXT: ; implicit-def: $sgpr11
+; VI-NEXT: ; implicit-def: $sgpr5
 ; VI-NEXT: s_branch .LBB109_2
 ; VI-NEXT: .LBB109_4:
-; VI-NEXT: v_mov_b32_e32 v1, s11
-; VI-NEXT: v_mov_b32_e32 v2, s9
-; VI-NEXT: v_mov_b32_e32 v5, s10
-; VI-NEXT: v_mov_b32_e32 v7, s8
-; VI-NEXT: v_mov_b32_e32 v3, s4
 ; VI-NEXT: v_mov_b32_e32 v0, s16
-; VI-NEXT: v_mov_b32_e32 v6, s5
+; VI-NEXT: v_mov_b32_e32 v6, s11
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v7, s5
+; VI-NEXT: v_mov_b32_e32 v5, s8
+; VI-NEXT: v_mov_b32_e32 v3, s4
 ; VI-NEXT: v_mov_b32_e32 v4, s17
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
index 6ada0cb8c46f1..ab629e1a4d269 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
@@ -2214,40 +2214,40 @@ define inreg <3 x i32> @bitcast_v6bf16_to_v3i32_scalar(<6 x bfloat> inreg %a, i3
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_cmp_lg_u32 s22, 0
-; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v12, 1.0, s17
 ; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v5, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v11, 1.0, s19
 ; SI-NEXT: v_mul_f32_e64 v6, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v10, 1.0, s21
 ; SI-NEXT: v_mul_f32_e64 v4, 1.0, s20
 ; SI-NEXT: s_cbranch_scc0 .LBB11_4
 ; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; SI-NEXT: v_alignbit_b32 v0, v0, v8, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v6, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v4, 16
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v12
+; SI-NEXT: v_lshr_b64 v[0:1], v[8:9], 16
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v11
+; SI-NEXT: v_lshr_b64 v[1:2], v[6:7], 16
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v10
+; SI-NEXT: v_lshr_b64 v[2:3], v[4:5], 16
 ; SI-NEXT: s_cbranch_execnz .LBB11_3
 ; SI-NEXT: .LBB11_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12
 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v10
 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
 ; SI-NEXT: .LBB11_3: ; %end
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ; SI-NEXT: .LBB11_4:
@@ -2263,60 +2263,61 @@ define inreg <3 x i32> @bitcast_v6bf16_to_v3i32_scalar(<6 x bfloat> inreg %a, i3
 ; VI-NEXT: s_cbranch_execnz .LBB11_4
 ; VI-NEXT: .LBB11_2: ; %cmp.true
 ; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_mov_b32_e32 v5, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v5
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v5
 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
 ; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v5
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v5
 ; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
 ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v0
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
 ; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v3, v1, 16
-; VI-NEXT: v_add_f32_e32 v3, s4, v0
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_lshrrev_b64 v[3:4], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v5
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_bfe_u32 v4, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v5
+; VI-NEXT: v_bfe_u32 v4, v1, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1
 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, v3
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ; VI-NEXT: .LBB11_3:
 ; VI-NEXT: s_branch .LBB11_2
@@ -5430,40 +5431,40 @@ define inreg <3 x float> @bitcast_v6bf16_to_v3f32_scalar(<6 x bfloat> inreg %a,
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_cmp_lg_u32 s22, 0
-; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v12, 1.0, s17
 ; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v5, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v11, 1.0, s19
 ; SI-NEXT: v_mul_f32_e64 v6, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v10, 1.0, s21
 ; SI-NEXT: v_mul_f32_e64 v4, 1.0, s20
 ; SI-NEXT: s_cbranch_scc0 .LBB27_4
 ; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; SI-NEXT: v_alignbit_b32 v0, v0, v8, 16
-; SI-NEXT: v_alignbit_b32 v1, v1, v6, 16
-; SI-NEXT: v_alignbit_b32 v2, v2, v4, 16
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v12
+; SI-NEXT: v_lshr_b64 v[0:1], v[8:9], 16
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v11
+; SI-NEXT: v_lshr_b64 v[1:2], v[6:7], 16
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v10
+; SI-NEXT: v_lshr_b64 v[2:3], v[4:5], 16
 ; SI-NEXT: s_cbranch_execnz .LBB27_3
 ; SI-NEXT: .LBB27_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12
 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
-; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v10
 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
 ; SI-NEXT: .LBB27_3: ; %end
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ; SI-NEXT: .LBB27_4:
@@ -5479,60 +5480,61 @@ define inreg <3 x float> @bitcast_v6bf16_to_v3f32_scalar(<6 x bfloat> inreg %a,
 ; VI-NEXT: s_cbranch_execnz .LBB27_4
 ; VI-NEXT: .LBB27_2: ; %cmp.true
 ; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_mov_b32_e32 v5, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v5
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v5
 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
 ; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v5
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v5
 ; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
 ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; VI-NEXT: v_add_f32_e32 v3, s4, v0
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
 ; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v1, v3, v1, 16
-; VI-NEXT: v_add_f32_e32 v3, s4, v0
-; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; VI-NEXT: v_lshrrev_b64 v[3:4], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v5
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; VI-NEXT: v_bfe_u32 v4, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v5
+; VI-NEXT: v_bfe_u32 v4, v1, 16, 1
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1
 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, v3
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ; VI-NEXT: .LBB27_3:
 ; VI-NEXT: s_branch .LBB27_2
@@ -8098,70 +8100,71 @@ define inreg <12 x i8> @bitcast_v6bf16_to_v12i8_scalar(<6 x bfloat> inreg %a, i3
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_cmp_lg_u32 s22, 0
-; SI-NEXT: v_mul_f32_e64 v17, 1.0, s17
-; SI-NEXT: v_mul_f32_e64 v18, 1.0, s16
-; SI-NEXT: v_mul_f32_e64 v15, 1.0, s19
-; SI-NEXT: v_mul_f32_e64 v16, 1.0, s18
-; SI-NEXT: v_mul_f32_e64 v0, 1.0, s21
-; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20
+; SI-NEXT: v_mul_f32_e64 v21, 1.0, s17
+; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19
+; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18
+; SI-NEXT: v_mul_f32_e64 v19, 1.0, s21
+; SI-NEXT: v_mul_f32_e64 v9, 1.0, s20
 ; SI-NEXT: s_cbranch_scc0 .LBB39_4
 ; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v15
-; SI-NEXT: v_alignbit_b32 v12, v1, v18, 16
-; SI-NEXT: v_alignbit_b32 v13, v6, v16, 16
-; SI-NEXT: v_lshr_b64 v[3:4], v[12:13], 24
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0
-; SI-NEXT: v_lshr_b64 v[4:5], v[12:13], 16
-; SI-NEXT: v_alignbit_b32 v8, v10, v14, 16
-; SI-NEXT: v_lshr_b64 v[1:2], v[12:13], 8
-; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v15
-; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v0
-; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v13
-; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21
+; SI-NEXT: v_lshr_b64 v[16:17], v[0:1], 16
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v20
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v19
+; SI-NEXT: v_lshr_b64 v[17:18], v[5:6], 16
+; SI-NEXT: v_lshr_b64 v[13:14], v[9:10], 16
+; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 24
+; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v20
+; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v19
+; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v17
+; SI-NEXT: v_lshrrev_b32_e32 v12, 8, v13
+; SI-NEXT: v_lshr_b64 v[14:15], v[16:17], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[16:17], 8
 ; SI-NEXT: s_cbranch_execnz .LBB39_3
 ; SI-NEXT: .LBB39_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v17
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v18
-; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_alignbit_b32 v12, v2, v1, 16
-; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v15
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16
-; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21
 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; SI-NEXT: v_alignbit_b32 v13, v6, v1, 16
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14
-; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0
-; SI-NEXT: v_lshr_b64 v[3:4], v[12:13], 24
-; SI-NEXT: v_alignbit_b32 v8, v10, v1, 16
-; SI-NEXT: v_lshr_b64 v[4:5], v[12:13], 16
-; SI-NEXT: v_lshr_b64 v[1:2], v[12:13], 8
-; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v13
-; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8
-; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7
-; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v0
+; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_lshr_b64 v[16:17], v[0:1], 16
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v20
+; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19
+; SI-NEXT: v_lshr_b64 v[17:18], v[5:6], 16
+; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v1
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5
+; SI-NEXT: v_lshr_b64 v[13:14], v[9:10], 16
+; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 24
+; SI-NEXT: v_lshr_b64 v[14:15], v[16:17], 16
+; SI-NEXT: v_lshr_b64 v[1:2], v[16:17], 8
+; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v17
+; SI-NEXT: v_lshrrev_b32_e32 v12, 8, v13
+; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0
+; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v5
 ; SI-NEXT: .LBB39_3: ; %end
-; SI-NEXT: v_mov_b32_e32 v0, v12
-; SI-NEXT: v_mov_b32_e32 v2, v4
-; SI-NEXT: v_mov_b32_e32 v4, v13
+; SI-NEXT: v_mov_b32_e32 v0, v16
+; SI-NEXT: v_mov_b32_e32 v2, v14
+; SI-NEXT: v_mov_b32_e32 v4, v17
+; SI-NEXT: v_mov_b32_e32 v5, v8
+; SI-NEXT: v_mov_b32_e32 v8, v13
+; SI-NEXT: v_mov_b32_e32 v9, v12
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ; SI-NEXT: .LBB39_4:
-; SI-NEXT: ; implicit-def: $vgpr12
+; SI-NEXT: ; implicit-def: $vgpr16
 ; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr14
 ; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr6
-; SI-NEXT: ; implicit-def: $vgpr7
 ; SI-NEXT: ; implicit-def: $vgpr8
-; SI-NEXT: ; implicit-def: $vgpr9
-; SI-NEXT: ; implicit-def: $vgpr10
+; SI-NEXT: ; implicit-def: $vgpr7
+; SI-NEXT: ; implicit-def: $vgpr13
+; SI-NEXT: ; implicit-def: $vgpr12
 ; SI-NEXT: ; implicit-def: $vgpr11
 ; SI-NEXT: s_branch .LBB39_2
 ;
@@ -8171,110 +8174,110 @@ define inreg <12 x i8> @bitcast_v6bf16_to_v12i8_scalar(<6 x bfloat> inreg %a, i3
 ; VI-NEXT: s_cmp_lg_u32 s19, 0
 ; VI-NEXT: s_cbranch_scc0 .LBB39_3
 ; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_lshr_b32 s19, s16, 8
-; VI-NEXT: s_lshr_b32 s10, s18, 16
-; VI-NEXT: s_lshr_b32 s11, s18, 8
+; VI-NEXT: s_lshr_b32 s19, s18, 16
+; VI-NEXT: s_lshr_b32 s15, s18, 8
 ; VI-NEXT: s_lshr_b32 s12, s17, 24
-; VI-NEXT: s_lshr_b32 s13, s17, 16
-; VI-NEXT: s_lshr_b32 s15, s17, 8
+; VI-NEXT: s_lshr_b32 s11, s17, 16
+; VI-NEXT: s_lshr_b32 s10, s17, 8
 ; VI-NEXT: s_lshr_b32 s14, s16, 16
+; VI-NEXT: s_lshr_b32 s13, s16, 8
 ; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24
 ; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24
 ; VI-NEXT: s_cbranch_execnz .LBB39_4
 ; VI-NEXT: .LBB39_2: ; %cmp.true
 ; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_mov_b32_e32 v3, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v3
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v3
 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
 ; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
+; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1]
+; VI-NEXT: v_add_f32_e32 v0, s4, v3
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v3
 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
 ; VI-NEXT: s_lshl_b32 s4, s18, 16
-; VI-NEXT: v_alignbit_b32 v14, v2, v1, 16
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; VI-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc
+; VI-NEXT: v_add_f32_e32 v2, s4, v3
+; VI-NEXT: v_bfe_u32 v5, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
 ; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v8, v0, v1, 16
+; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_add_f32_e32 v3, s4, v3
+; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
+; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[2:3]
+; VI-NEXT: v_mov_b32_e32 v1, v4
 ; VI-NEXT: v_mov_b32_e32 v9, 0x7fc07fc0
-; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[14:15]
+; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[0:1]
 ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9]
 ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8
 ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v8
-; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v15
-; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v15
-; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v15
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v14
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v14
+; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v4
+; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
 ; VI-NEXT: s_branch .LBB39_5
 ; VI-NEXT: .LBB39_3:
-; VI-NEXT: ; implicit-def: $sgpr19
+; VI-NEXT: ; implicit-def: $sgpr13
 ; VI-NEXT: ; implicit-def: $sgpr14
 ; VI-NEXT: ; implicit-def: $sgpr4
-; VI-NEXT: ; implicit-def: $sgpr15
-; VI-NEXT: ; implicit-def: $sgpr13
-; VI-NEXT: ; implicit-def: $sgpr12
-; VI-NEXT: ; implicit-def: $sgpr11
 ; VI-NEXT: ; implicit-def: $sgpr10
+; VI-NEXT: ; implicit-def: $sgpr11
+; VI-NEXT: ; implicit-def: $sgpr12
+; VI-NEXT: ; implicit-def: $sgpr15
+; VI-NEXT: ; implicit-def: $sgpr19
 ; VI-NEXT: ; implicit-def: $sgpr6
 ; VI-NEXT: s_branch .LBB39_2
 ; VI-NEXT: .LBB39_4:
-; VI-NEXT: v_mov_b32_e32 v14, s16
-; VI-NEXT: v_mov_b32_e32 v15, s17
 ; VI-NEXT: v_mov_b32_e32 v8, s18
-; VI-NEXT: v_mov_b32_e32 v1, s19
+; VI-NEXT: v_mov_b32_e32 v0, s16
+; VI-NEXT: v_mov_b32_e32 v4, s17
+; VI-NEXT: v_mov_b32_e32 v10, s19
+; VI-NEXT: v_mov_b32_e32 v13, s15
 ; VI-NEXT: v_mov_b32_e32 v2, s14
-; VI-NEXT: v_mov_b32_e32 v5, s15
-; VI-NEXT: v_mov_b32_e32 v6, s13
+; VI-NEXT: v_mov_b32_e32 v1, s13
 ; VI-NEXT: v_mov_b32_e32 v7, s12
-; VI-NEXT: v_mov_b32_e32 v13, s11
-; VI-NEXT: v_mov_b32_e32 v10, s10
+; VI-NEXT: v_mov_b32_e32 v6, s11
+; VI-NEXT: v_mov_b32_e32 v5, s10
 ; VI-NEXT: v_mov_b32_e32 v11, s6
-; VI-NEXT: v_mov_b32_e32 v3, s4
+; VI-NEXT: v_mov_b32_e32 v14, s4
 ; VI-NEXT: .LBB39_5: ; %end
-; VI-NEXT: v_mov_b32_e32 v0, v14
-; VI-NEXT: v_mov_b32_e32 v4, v15
+; VI-NEXT: v_mov_b32_e32 v3, v14
 ; VI-NEXT: v_mov_b32_e32 v9, v13
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -11854,33 +11857,32 @@ define inreg <6 x half> @bitcast_v6bf16_to_v6f16_scalar(<6 x bfloat> inreg %a, i
 ; VI-NEXT: s_cbranch_execnz .LBB49_4
 ; VI-NEXT: .LBB49_2: ; %cmp.true
 ; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; VI-NEXT: v_mov_b32_e32 v3, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v3
+; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT: v_add_f32_e32 v1, s4, v3
 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
 ;
VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v3 +; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v3 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 @@ -11889,25 +11891,27 @@ define inreg <6 x half> @bitcast_v6bf16_to_v6f16_scalar(<6 x bfloat> inreg %a, i ; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc ; VI-NEXT: s_lshl_b32 s4, s18, 16 ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v3 ; VI-NEXT: v_bfe_u32 v6, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v2 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_add_f32_e32 v3, s4, v3 ; VI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc -; VI-NEXT: v_bfe_u32 v6, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v2, v0, v2, 16 -; VI-NEXT: v_alignbit_b32 v1, v5, v1, 16 -; VI-NEXT: v_alignbit_b32 v0, v4, v3, 16 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[3:4], 16, v[4:5] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB49_3: ; VI-NEXT: s_branch .LBB49_2 @@ -12813,50 +12817,52 @@ define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s22, 0 ; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s21 ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: 
v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v9 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; SI-NEXT: v_lshr_b64 v[6:7], v[1:2], 16 -; SI-NEXT: v_alignbit_b32 v4, v5, v9, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshr_b64 v[6:7], v[4:5], 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_lshr_b64 v[2:3], v[3:4], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; SI-NEXT: v_lshr_b64 v[7:8], v[1:2], 16 ; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_branch .LBB53_2 ; @@ -12869,33 +12875,32 @@ define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i3 ; VI-NEXT: s_cbranch_execnz .LBB53_4 ; VI-NEXT: .LBB53_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_mov_b32_e32 v3, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v3 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v3 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; 
VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v3 +; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v3 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 @@ -12904,25 +12909,27 @@ define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i3 ; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc ; VI-NEXT: s_lshl_b32 s4, s18, 16 ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_add_f32_e32 v2, s4, v3 ; VI-NEXT: v_bfe_u32 v6, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v2 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_add_f32_e32 v3, s4, v3 ; VI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc -; VI-NEXT: v_bfe_u32 v6, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; VI-NEXT: v_bfe_u32 v6, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v2, v0, v2, 16 -; VI-NEXT: v_alignbit_b32 v1, v5, v1, 16 -; VI-NEXT: v_alignbit_b32 v0, v4, v3, 16 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[3:4], 16, v[4:5] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB53_3: ; VI-NEXT: s_branch .LBB53_2 diff --git a/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll b/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll index 18cf120a1d299..61645200690f5 100644 --- a/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll @@ -27,24 +27,26 @@ define amdgpu_kernel void @any_extend_vector_inreg_v16i8_to_v4i32(ptr addrspace( ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:1 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:3 -; GFX6-NEXT: s_lshr_b32 s8, s9, 16 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_lshr_b32 s7, s9, 16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b64 s[6:7], s[4:5], 8 -; GFX6-NEXT: v_mov_b32_e32 
v1, s11 -; GFX6-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:9 +; GFX6-NEXT: s_lshl_b64 s[8:9], s[4:5], 8 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s11 +; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:9 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:2 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], 16 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2 -; GFX6-NEXT: v_alignbit_b32 v0, s8, v0, 16 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:12 +; GFX6-NEXT: s_lshr_b32 s5, s4, 8 +; GFX6-NEXT: s_lshr_b32 s4, s4, 24 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:12 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:5 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:7 ; GFX6-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 6a95881067b93..ff74d1f71616d 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -2147,12 +2147,12 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s4, v2, s[2:3] +; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s4, v2, s[2:3] ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s5, v2, v[1:2] -; GFX1164-NEXT: v_mov_b32_e32 v1, v3 -; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164-NEXT: v_mov_b32_e32 v0, v4 +; GFX1164-NEXT: v_mad_u64_u32 v[4:5], null, s5, v2, v[0:1] +; GFX1164-NEXT: buffer_store_b64 v[3:4], off, s[0:3], 0 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: add_i64_uniform: @@ -2190,12 +2190,12 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s4, v2, s[2:3] +; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s4, v2, s[2:3] ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s5, v2, v[1:2] -; GFX1132-NEXT: v_mov_b32_e32 v1, v3 -; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132-NEXT: v_mov_b32_e32 v0, v4 +; GFX1132-NEXT: v_mad_u64_u32 v[4:5], null, s5, v2, v[0:1] +; GFX1132-NEXT: buffer_store_b64 v[3:4], off, s[0:3], 0 ; GFX1132-NEXT: s_endpgm ; ; GFX1264-LABEL: add_i64_uniform: @@ -6208,10 +6208,9 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s8, v4, 0 -; GFX9-NEXT: 
v_readfirstlane_b32 s7, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s9, v4, v[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s9, v4, v[3:4] ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s7, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 08a4f0cdad18f..f5ca24f59a286 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -1889,13 +1889,13 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1164-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, s[4:5] ; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2] +; GFX1164-NEXT: v_mov_b32_e32 v0, v4 +; GFX1164-NEXT: v_mad_u64_u32 v[4:5], null, s3, v2, v[0:1] ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: v_mov_b32_e32 v1, v3 -; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164-NEXT: buffer_store_b64 v[3:4], off, s[0:3], 0 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: add_i64_uniform: @@ -1926,13 +1926,13 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1132-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, s[4:5] ; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2] +; GFX1132-NEXT: v_mov_b32_e32 v0, v4 +; GFX1132-NEXT: v_mad_u64_u32 v[4:5], null, s3, v2, v[0:1] ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: v_mov_b32_e32 v1, v3 -; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132-NEXT: buffer_store_b64 v[3:4], off, s[0:3], 0 ; GFX1132-NEXT: s_endpgm entry: %old = atomicrmw add ptr addrspace(3) @local_var64, i64 %additive acq_rel @@ -5182,13 +5182,12 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX9-NEXT: .LBB13_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[8:9], s2, v2, 0 -; GFX9-NEXT: v_readfirstlane_b32 s8, v0 ; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v2, 0 ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v2, v[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s8, v0 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v2, v[4:5] ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s8, v3 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index a9026d22a9686..5bec8edc37476 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -46071,18 +46071,18 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> inreg %b, i32 %c) { ; GCN-LABEL: s_select_v3bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s1 -; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s0 -; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s4 -; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s3 +; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s1 +; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s0 +; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s4 +; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s3 ; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s2 ; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: v_alignbit_b32 v2, v3, v4, 16 +; GCN-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; GCN-NEXT: v_lshr_b64 v[2:3], v[3:4], 16 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc @@ -46093,13 +46093,13 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> ; GFX7-LABEL: s_select_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s0 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0 +; GFX7-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s3 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s3 +; GFX7-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 ; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s2 ; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s5 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -46209,22 +46209,22 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> inreg %b, i32 %c) { ; GCN-LABEL: s_select_v4bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s1 -; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s0 -; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s5 -; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s4 -; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s3 -; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s2 -; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s7 -; GCN-NEXT: v_mul_f32_e64 v8, 1.0, s6 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: v_alignbit_b32 v2, v3, v4, 16 -; GCN-NEXT: v_alignbit_b32 v3, v5, v6, 16 -; GCN-NEXT: v_alignbit_b32 v4, v7, v8, 16 +; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s1 +; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s0 +; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s5 +; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s4 +; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s3 +; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s2 +; GCN-NEXT: v_mul_f32_e64 v8, 1.0, s7 +; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s6 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; GCN-NEXT: v_lshr_b64 v[2:3], 
v[3:4], 16 +; GCN-NEXT: v_lshr_b64 v[3:4], v[5:6], 16 +; GCN-NEXT: v_lshr_b64 v[4:5], v[7:8], 16 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc @@ -46235,21 +46235,21 @@ define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; GFX7-LABEL: s_select_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s0 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0 +; GFX7-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s5 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s4 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; GFX7-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 ; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s3 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s2 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s2 +; GFX7-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 ; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s7 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s6 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s6 +; GFX7-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir b/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir index 8a39d9c517b50..34c0159dd3ddb 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir @@ -68,7 +68,7 @@ body: | ; CHECK-NEXT: successors: %bb.3(0x04000000), %bb.7(0x7c000000) ; CHECK-NEXT: liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2424842 /* regdef:SReg_32 */, def renamable $sgpr4 + ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 1835018 /* regdef:SReg_32 */, def renamable $sgpr4 ; CHECK-NEXT: S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.3, implicit 
killed $scc ; CHECK-NEXT: {{ $}} @@ -149,7 +149,7 @@ body: | successors: %bb.3(0x04000000), %bb.2(0x7c000000) liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1 - INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2424842 /* regdef:SReg_32 */, def renamable $sgpr4 + INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 1835018 /* regdef:SReg_32 */, def renamable $sgpr4 S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc S_CBRANCH_SCC1 %bb.2, implicit killed $scc diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir b/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir index a2f02052cbf36..4cf92b0127131 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir @@ -69,7 +69,7 @@ body: | ; CHECK-NEXT: successors: %bb.3(0x04000000), %bb.7(0x7c000000) ; CHECK-NEXT: liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2424842 /* regdef:SReg_32 */, def renamable $sgpr4 + ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 1835018 /* regdef:SReg_32 */, def renamable $sgpr4 ; CHECK-NEXT: S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.3, implicit killed $scc ; CHECK-NEXT: {{ $}} @@ -151,7 +151,7 @@ body: | successors: %bb.3(0x04000000), %bb.2(0x7c000000) liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, 
$sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1 - INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2424842 /* regdef:SReg_32 */, def renamable $sgpr4 + INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 1835018 /* regdef:SReg_32 */, def renamable $sgpr4 S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc S_CBRANCH_SCC1 %bb.2, implicit killed $scc diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll index 142290a39f8f4..361bc78759bfa 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll @@ -2382,17 +2382,17 @@ define <12 x i8> @load_v12i8(ptr addrspace(8) inreg %buf) { ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v8, v2 -; SDAG-NEXT: v_lshrrev_b32_e32 v9, 8, v2 ; SDAG-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] ; SDAG-NEXT: v_lshrrev_b32_e32 v14, 8, v0 ; SDAG-NEXT: v_lshrrev_b32_e32 v13, 16, v0 -; SDAG-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; SDAG-NEXT: v_lshrrev_b64 v[11:12], 24, v[2:3] ; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; SDAG-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; SDAG-NEXT: v_lshrrev_b32_e32 v9, 8, v2 ; SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; SDAG-NEXT: v_mov_b32_e32 v4, v1 +; SDAG-NEXT: v_mov_b32_e32 v8, v2 ; SDAG-NEXT: v_mov_b32_e32 v1, v14 ; SDAG-NEXT: v_mov_b32_e32 v2, v13 ; SDAG-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll index fbaaef0b29b66..37f4094806637 100644 --- a/llvm/test/CodeGen/AMDGPU/build_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll @@ -186,10 +186,12 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 5 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_alignbit_b32 v0, 5, s6, 16 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], 16 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll index d7d697ef85b9f..00baf0a44368d 100644 --- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll +++ 
b/llvm/test/CodeGen/AMDGPU/bypass-div.ll @@ -1026,102 +1026,100 @@ define i64 @sdiv64_known32(i64 %a, i64 %b) { ; GFX9-NEXT: v_or_b32_e32 v5, v2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, v3 ; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v0 -; GFX9-NEXT: v_sub_co_u32_e32 v11, vcc, 0, v6 -; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, 0, v0, vcc -; GFX9-NEXT: v_madmk_f32 v1, v3, 0x4f800000, v1 -; GFX9-NEXT: v_rcp_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v1 -; GFX9-NEXT: v_trunc_f32_e32 v3, v3 -; GFX9-NEXT: v_madmk_f32 v1, v3, 0xcf800000, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v10, v3 -; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_lo_u32 v5, v11, v10 -; GFX9-NEXT: v_mul_lo_u32 v8, v12, v1 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v1, 0 -; GFX9-NEXT: v_add3_u32 v8, v4, v5, v8 -; GFX9-NEXT: v_mul_hi_u32 v9, v1, v3 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v8, 0 -; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v9, v4 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v8, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v13, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v10, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, v11, v13 -; GFX9-NEXT: v_mul_lo_u32 v8, v12, v1 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v1, 0 -; GFX9-NEXT: v_add3_u32 v8, v4, v5, v8 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v8, 0 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v8, 0 -; GFX9-NEXT: v_mul_hi_u32 v12, v1, v3 -; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v3, 0 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v12, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v11, vcc +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v0 +; GFX9-NEXT: v_sub_co_u32_e32 v10, vcc, 0, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v0, vcc +; GFX9-NEXT: v_madmk_f32 v4, v5, 0x4f800000, v4 +; GFX9-NEXT: v_rcp_f32_e32 v4, v4 +; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; GFX9-NEXT: v_trunc_f32_e32 v5, v5 +; GFX9-NEXT: v_madmk_f32 v4, v5, 0xcf800000, v4 +; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v5 +; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v4 +; GFX9-NEXT: v_mul_lo_u32 v6, v10, v8 +; GFX9-NEXT: v_mul_lo_u32 v7, v11, v9 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0 +; GFX9-NEXT: v_add3_u32 v7, v5, v6, v7 +; GFX9-NEXT: v_mul_hi_u32 v12, v9, v4 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0 +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v5 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v6, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v13, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, 
vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v13, v4, vcc -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v5, 0 -; GFX9-NEXT: v_mul_hi_u32 v8, v7, v1 -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v8, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v4, vcc -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v1, 0 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v5, 0 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v10, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, v0, v1 -; GFX9-NEXT: v_mul_lo_u32 v9, v6, v5 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v1, 0 -; GFX9-NEXT: v_add3_u32 v4, v4, v9, v8 -; GFX9-NEXT: v_sub_u32_e32 v8, v2, v4 -; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, v7, v3 -; GFX9-NEXT: v_subb_co_u32_e64 v7, s[4:5], v8, v0, vcc -; GFX9-NEXT: v_sub_co_u32_e64 v8, s[4:5], v3, v6 -; GFX9-NEXT: v_subbrev_co_u32_e64 v7, s[4:5], 0, v7, s[4:5] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v0 +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v9, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v8, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v10, v13 +; GFX9-NEXT: v_mul_lo_u32 v7, v11, v12 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0 +; GFX9-NEXT: v_add3_u32 v7, v5, v6, v7 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0 +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0 +; GFX9-NEXT: v_mul_hi_u32 v11, v12, v4 +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v4, 0 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v11, v7 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v10, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v5, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v7, 0 +; GFX9-NEXT: v_mul_hi_u32 v8, v1, v6 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v5, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v7, 0 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v4, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, v0, v6 +; GFX9-NEXT: v_mul_lo_u32 v9, v3, v7 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v6, 0 +; GFX9-NEXT: v_add3_u32 v5, v5, v9, v8 +; GFX9-NEXT: v_sub_u32_e32 v8, v2, v5 +; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v4 +; GFX9-NEXT: v_subb_co_u32_e64 v4, s[4:5], v8, v0, vcc +; GFX9-NEXT: v_sub_co_u32_e64 v8, s[4:5], v1, v3 +; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[4:5], 0, v4, s[4:5] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v6 +; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v0 -; GFX9-NEXT: 
v_cndmask_b32_e64 v7, v9, v8, s[4:5] -; GFX9-NEXT: v_add_co_u32_e64 v8, s[4:5], 2, v1 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, v5, s[4:5] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v9, v8, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v8, s[4:5], 2, v6 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, v7, s[4:5] ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0 -; GFX9-NEXT: v_add_co_u32_e64 v10, s[4:5], 1, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; GFX9-NEXT: v_add_co_u32_e64 v10, s[4:5], 1, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, v7, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v0 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v9, s[4:5] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v4, v11, v9, s[4:5] ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v10, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v0, vcc ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: .LBB10_2: ; %Flow diff --git a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir index 03f1018c40b21..9e1444d9213e7 100644 --- a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir +++ b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir @@ -20,13 +20,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 undef %2.sub0:areg_64 = COPY %0 %2.sub1:areg_64 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %2 SI_RETURN ... @@ -45,13 +45,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 undef %2.sub0:areg_64_align2 = COPY %0 %2.sub1:areg_64_align2 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... 
@@ -72,7 +72,7 @@ body: | ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_96 = COPY [[COPY]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:areg_96 = COPY [[COPY1]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2:areg_96 = COPY [[COPY2]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY3]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 @@ -80,7 +80,7 @@ body: | undef %3.sub0:areg_96 = COPY %0 %3.sub1:areg_96 = COPY %1 %3.sub2:areg_96 = COPY %2 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %3 SI_RETURN ... @@ -101,7 +101,7 @@ body: | ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY1]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY2]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, [[COPY3]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 @@ -109,7 +109,7 @@ body: | undef %3.sub0:areg_96_align2 = COPY %0 %3.sub1:areg_96_align2 = COPY %1 %3.sub2:areg_96_align2 = COPY %2 - INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, %3 SI_RETURN ... @@ -128,13 +128,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %1:vreg_64 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128 = COPY %0 %2.sub2_sub3:areg_128 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, killed %2 SI_RETURN ... @@ -153,13 +153,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %1:vreg_64 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128_align2 = COPY %0 %2.sub2_sub3:areg_128_align2 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2 SI_RETURN ... 
@@ -178,13 +178,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr9 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:sgpr_32 = COPY $sgpr8 %1:sgpr_32 = COPY $sgpr9 undef %2.sub0:areg_64_align2 = COPY %0 %2.sub1:areg_64_align2 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -203,13 +203,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vreg_64 = COPY $vgpr1_vgpr2 undef %2.sub0:areg_96 = COPY %0 %2.sub1_sub2:areg_96 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %2 SI_RETURN ... @@ -228,13 +228,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vreg_64 = COPY $vgpr1_vgpr2 undef %2.sub0:areg_96_align2 = COPY %0 %2.sub1_sub2:areg_96_align2 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -253,13 +253,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:areg_96 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %1:vgpr_32 = COPY $vgpr2 undef %2.sub0_sub1:areg_96 = COPY %0 %2.sub2:areg_96 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %2 SI_RETURN ... 
@@ -278,13 +278,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %1:vgpr_32 = COPY $vgpr2 undef %2.sub0_sub1:areg_96_align2 = COPY %0 %2.sub2:areg_96_align2 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -302,12 +302,12 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %2.sub0:areg_64 = COPY %0 %2.sub1:areg_64 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %2 SI_RETURN ... @@ -326,13 +326,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 undef %2.sub0:areg_64_align2 = COPY %0 %2.sub1:areg_64_align2 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2 SI_RETURN ... @@ -350,12 +350,12 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_96 = COPY %0 %1.sub1:areg_96 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %1 + INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %1 SI_RETURN ... 
@@ -373,12 +373,12 @@ body: |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]]
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     %0:vgpr_32 = COPY $vgpr0
     undef %1.sub0:areg_96_align2 = COPY %0
     %1.sub1:areg_96_align2 = COPY %0
-    INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, %1
+    INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, %1
     SI_RETURN
 
 ...
@@ -398,14 +398,14 @@ body: |
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]]
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]]
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     %0:vgpr_32 = COPY $vgpr0
     undef %1.sub0:areg_128 = COPY %0
     %1.sub1:areg_128 = COPY %0
     %1.sub2:areg_128 = COPY %0
     %1.sub3:areg_128 = COPY %0
-    INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, killed %1
+    INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, killed %1
     SI_RETURN
 
 ...
@@ -425,14 +425,14 @@ body: |
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]]
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]]
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     %0:vgpr_32 = COPY $vgpr0
     undef %1.sub0:areg_128_align2 = COPY %0
     %1.sub1:areg_128_align2 = COPY %0
     %1.sub2:areg_128_align2 = COPY %0
     %1.sub3:areg_128_align2 = COPY %0
-    INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %1
+    INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %1
     SI_RETURN
 
 ...
@@ -451,15 +451,15 @@ body: |
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
     ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64 = COPY [[COPY]]
     ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64 = COPY [[COPY1]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY2]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY2]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vgpr_32 = COPY $vgpr0
     %1:vgpr_32 = COPY $vgpr1
     undef %2.sub0:areg_64 = COPY %0
     %2.sub1:areg_64 = COPY %1
-    INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %2
-    INLINEASM &"; use $0", 0 /* attdialect */, 1835017 /* reguse:VGPR_32 */, killed %0
+    INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %2
+    INLINEASM &"; use $0", 0 /* attdialect */, 1245193 /* reguse:VGPR_32 */, killed %0
     SI_RETURN
 
 ...
@@ -477,14 +477,14 @@ body: |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]]
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY1]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vgpr_32 = COPY $vgpr0
     undef %1.sub0:areg_64 = COPY %0
     %1.sub1:areg_64 = COPY %0
-    INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %1
-    INLINEASM &"; use $0", 0 /* attdialect */, 1835017 /* reguse:VGPR_32 */, killed %0
+    INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %1
+    INLINEASM &"; use $0", 0 /* attdialect */, 1245193 /* reguse:VGPR_32 */, killed %0
     SI_RETURN
 
 ...
@@ -503,16 +503,16 @@ body: |
     ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY [[COPY]].sub0
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY1]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:VReg_64 */, [[COPY]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 2818057 /* reguse:VReg_64 */, [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vgpr_32 = COPY $vgpr0
     undef %1.sub0:areg_64 = COPY %0
     %1.sub1:areg_64 = COPY %0
     undef %2.sub0:vreg_64 = COPY %0
     %2.sub1:vreg_64 = COPY %0
-    INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %1
-    INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:VReg_64 */, killed %2
+    INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %1
+    INLINEASM &"; use $0", 0 /* attdialect */, 2818057 /* reguse:VReg_64 */, killed %2
     SI_RETURN
 
 ...
@@ -533,13 +533,13 @@ body: |
     ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1
     ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub1
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     undef %0.sub0:vreg_64 = COPY $vgpr0
     %0.sub1:vreg_64 = COPY $vgpr1
     undef %2.sub0:areg_64 = COPY %0.sub0
     %2.sub1:areg_64 = COPY %0.sub1
-    INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %2
+    INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %2
     SI_RETURN
 
 ...
@@ -558,13 +558,13 @@ body: |
     ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1
     ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     undef %0.sub0:vreg_64 = COPY $vgpr0
     %0.sub1:vreg_64 = COPY $vgpr1
     undef %2.sub0:areg_64_align2 = COPY %0.sub0
     %2.sub1:areg_64_align2 = COPY %0.sub1
-    INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2
+    INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2
     SI_RETURN
 
 ...
@@ -585,7 +585,7 @@ body: |
     ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub1
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     undef %0.sub0:vreg_96 =COPY $vgpr0
     %0.sub1:vreg_96 = COPY $vgpr1
@@ -593,7 +593,7 @@ body: |
     undef %3.sub0:areg_96 = COPY %0.sub0
     %3.sub1:areg_96 = COPY %0.sub1
     %3.sub2:areg_96 = COPY %0.sub2
-    INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %3
+    INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %3
     SI_RETURN
 
 ...
@@ -614,7 +614,7 @@ body: |
     ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub1
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     undef %0.sub0:vreg_96 =COPY $vgpr0
     %0.sub1:vreg_96 = COPY $vgpr1
@@ -622,7 +622,7 @@ body: |
     undef %3.sub0:areg_96_align2 = COPY %0.sub0
     %3.sub1:areg_96_align2 = COPY %0.sub1
     %3.sub2:areg_96_align2 = COPY %0.sub2
-    INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, %3
+    INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, %3
     SI_RETURN
 
 ...
@@ -641,13 +641,13 @@ body: |
     ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
     ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]].sub0_sub1
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY]].sub2_sub3
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     undef %0.sub0_sub1:vreg_128 =COPY $vgpr0_vgpr1
     %0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
     undef %2.sub0_sub1:areg_128 = COPY %0.sub0_sub1
     %2.sub2_sub3:areg_128 = COPY %0.sub2_sub3
-    INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, killed %2
+    INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, killed %2
     SI_RETURN
 
 ...
@@ -668,13 +668,13 @@ body: |
     ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_128 = COPY $vgpr2_vgpr3
     ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY]].sub1
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     undef %0.sub0:vreg_128 =COPY $vgpr0_vgpr1
     %0.sub1:vreg_128 = COPY $vgpr2_vgpr3
     undef %2.sub0_sub1:areg_128_align2 = COPY %0.sub0
     %2.sub2_sub3:areg_128_align2 = COPY %0.sub1
-    INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %2
+    INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2
     SI_RETURN
 
 ...
@@ -693,13 +693,13 @@ body: |
     ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:sreg_64 = COPY $sgpr9
     ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     undef %0.sub0:sreg_64 = COPY $sgpr8
     %0.sub1:sreg_64 = COPY $sgpr9
     undef %2.sub0:areg_64_align2 = COPY %0.sub0
     %2.sub1:areg_64_align2 = COPY %0.sub1
-    INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2
+    INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2
     SI_RETURN
 
 ...
@@ -718,13 +718,13 @@ body: |
     ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
     ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY]].sub1_sub2
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     undef %0.sub0:vreg_96 =COPY $vgpr0
     %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
     undef %2.sub0:areg_96 = COPY %0.sub0
     %2.sub1_sub2:areg_96 = COPY %0.sub1_sub2
-    INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %2
+    INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %2
     SI_RETURN
 
 ...
@@ -743,13 +743,13 @@ body: |
     ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
     ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY]].sub1_sub2
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     undef %0.sub0:vreg_96 =COPY $vgpr0
     %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
     undef %2.sub0:areg_96_align2 = COPY %0.sub0
     %2.sub1_sub2:areg_96_align2 = COPY %0.sub1_sub2
-    INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2
+    INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2
     SI_RETURN
 
 ...
@@ -768,13 +768,13 @@ body: |
     ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
     ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]].sub0_sub1
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
     %0.sub2:vreg_96 = COPY $vgpr2
     undef %2.sub0_sub1:areg_96 = COPY %0.sub0_sub1
     %2.sub2:areg_96 = COPY %0.sub2
-    INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %2
+    INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %2
     SI_RETURN
 
 ...
@@ -793,13 +793,13 @@ body: |
     ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
     ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]].sub0_sub1
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
     %0.sub2:vreg_96 = COPY $vgpr2
     undef %2.sub0_sub1:areg_96_align2 = COPY %0.sub0_sub1
     %2.sub2:areg_96_align2 = COPY %0.sub2
-    INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2
+    INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2
     SI_RETURN
 
 ...
@@ -817,12 +817,12 @@ body: |
     ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
     ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub0
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     undef %0.sub0:vreg_64 = COPY $vgpr0
     undef %2.sub0:areg_64 = COPY %0.sub0
     %2.sub1:areg_64 = COPY %0.sub0
-    INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %2
+    INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %2
     SI_RETURN
 
 ...
@@ -841,13 +841,13 @@ body: |
     ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub0
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     undef %0.sub0:vreg_64 = COPY $vgpr0
     undef %1.sub0:areg_96 = COPY %0.sub0
     %1.sub1:areg_96 = COPY %0.sub0
     %1.sub2:areg_96 = COPY %0.sub0
-    INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %1
+    INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %1
     SI_RETURN
 
 ...
@@ -865,12 +865,12 @@ body: |
     ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
     ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub0
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     undef %0.sub0:vreg_64 = COPY $vgpr0
     undef %1.sub0:areg_96_align2 = COPY %0.sub0
     %1.sub1:areg_96_align2 = COPY %0.sub0
-    INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, %1
+    INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, %1
     SI_RETURN
 
 ...
@@ -890,14 +890,14 @@ body: |
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]].sub0
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     undef %0.sub0:vreg_64 = COPY $vgpr0
     undef %1.sub0:areg_128 = COPY %0.sub0
     %1.sub1:areg_128 = COPY %0.sub0
     %1.sub2:areg_128 = COPY %0.sub0
     %1.sub3:areg_128 = COPY %0.sub0
-    INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, killed %1
+    INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, killed %1
     SI_RETURN
 
 ...
@@ -917,14 +917,14 @@ body: |
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]].sub0
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     undef %0.sub0:vreg_64 = COPY $vgpr0
     undef %1.sub0:areg_128_align2 = COPY %0.sub0
     %1.sub1:areg_128_align2 = COPY %0.sub0
     %1.sub2:areg_128_align2 = COPY %0.sub0
     %1.sub3:areg_128_align2 = COPY %0.sub0
-    INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %1
+    INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %1
     SI_RETURN
 
 ...
@@ -943,13 +943,13 @@ body: |
     ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1
     ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub1
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     undef %0.sub0:vreg_64 = COPY $vgpr0
     %0.sub1:vreg_64 = COPY $vgpr1
     undef %2.sub0:areg_64 = COPY %0.sub0
     %2.sub1:areg_64 = COPY %0.sub1
-    INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %2
+    INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %2
     SI_RETURN
 
 ...
@@ -968,13 +968,13 @@ body: |
     ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr1
     ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     undef %0.sub0:vreg_64_align2 = COPY $vgpr0
     %0.sub1:vreg_64_align2 = COPY $vgpr1
     undef %2.sub0:areg_64_align2 = COPY %0.sub0
     %2.sub1:areg_64_align2 = COPY %0.sub1
-    INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2
+    INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2
     SI_RETURN
 
 ...
@@ -995,7 +995,7 @@ body: |
     ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub1
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     undef %0.sub0:vreg_96 = COPY $vgpr0
     %0.sub1:vreg_96 = COPY $vgpr1
@@ -1003,7 +1003,7 @@ body: |
     undef %3.sub0:areg_96 = COPY %0.sub0
     %3.sub1:areg_96 = COPY %0.sub1
     %3.sub2:areg_96 = COPY %0.sub2
-    INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %3
+    INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %3
     SI_RETURN
 
 ...
@@ -1024,7 +1024,7 @@ body: |
     ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub1
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     undef %0.sub0:vreg_96_align2 = COPY $vgpr0
     %0.sub1:vreg_96_align2 = COPY $vgpr1
@@ -1032,7 +1032,7 @@ body: |
     undef %3.sub0:areg_96_align2 = COPY %0.sub0
     %3.sub1:areg_96_align2 = COPY %0.sub1
     %3.sub2:areg_96_align2 = COPY %0.sub2
-    INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, %3
+    INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, %3
     SI_RETURN
 
 ...
@@ -1051,13 +1051,13 @@ body: |
     ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
     ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]].sub0_sub1
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY]].sub2_sub3
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     undef %0.sub0_sub1:vreg_128 = COPY $vgpr0_vgpr1
     %0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
     undef %2.sub0_sub1:areg_128 = COPY %0.sub0_sub1
     %2.sub2_sub3:areg_128 = COPY %0.sub2_sub3
-    INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, killed %2
+    INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, killed %2
     SI_RETURN
 
 ...
@@ -1076,13 +1076,13 @@ body: |
     ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128_align2 = COPY $vgpr2_vgpr3
     ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]].sub0_sub1
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY]].sub2_sub3
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     undef %0.sub0_sub1:vreg_128_align2 = COPY $vgpr0_vgpr1
     %0.sub2_sub3:vreg_128_align2 = COPY $vgpr2_vgpr3
     undef %2.sub0_sub1:areg_128_align2 = COPY %0.sub0_sub1
     %2.sub2_sub3:areg_128_align2 = COPY %0.sub2_sub3
-    INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %2
+    INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2
     SI_RETURN
 
 ...
@@ -1101,13 +1101,13 @@ body: |
     ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:sreg_64 = COPY $sgpr9
     ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     undef %0.sub0:sreg_64 = COPY $sgpr8
     %0.sub1:sreg_64 = COPY $sgpr9
     undef %2.sub0:areg_64_align2 = COPY %0.sub0
     %2.sub1:areg_64_align2 = COPY %0.sub1
-    INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2
+    INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2
     SI_RETURN
 
 ...
@@ -1126,13 +1126,13 @@ body: |
     ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
     ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY]].sub1_sub2
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     undef %0.sub0:vreg_96 = COPY $vgpr0
     %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
     undef %2.sub0:areg_96 = COPY %0.sub0
     %2.sub1_sub2:areg_96 = COPY %0.sub1_sub2
-    INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %2
+    INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %2
     SI_RETURN
 
 ...
@@ -1150,13 +1150,13 @@ body: |
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
     ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96 = COPY [[COPY]]
     ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY1]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY2]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY2]]
     ; CHECK-NEXT: SI_RETURN
     undef %0.sub0:vreg_96 = COPY $vgpr0
     %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
     undef %2.sub0:areg_96 = COPY %0.sub2
     %2.sub1_sub2:areg_96 = COPY %0.sub0_sub1
-    INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %2
+    INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %2
     SI_RETURN
 
 ...
@@ -1176,13 +1176,13 @@ body: |
     ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96_align2 = COPY $vgpr1_vgpr2
     ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY]].sub1_sub2
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     undef %0.sub0:vreg_96_align2 = COPY $vgpr0
     %0.sub1_sub2:vreg_96_align2 = COPY $vgpr1_vgpr2
     undef %2.sub0:areg_96_align2 = COPY %0.sub0
     %2.sub1_sub2:areg_96_align2 = COPY %0.sub1_sub2
-    INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2
+    INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2
     SI_RETURN
 
 ...
@@ -1201,13 +1201,13 @@ body: |
     ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
     ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]].sub0_sub1
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
     %0.sub2:vreg_96 = COPY $vgpr2
     undef %2.sub0_sub1:areg_96 = COPY %0.sub0_sub1
     %2.sub2:areg_96 = COPY %0.sub2
-    INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %2
+    INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %2
     SI_RETURN
 
 ...
@@ -1226,13 +1226,13 @@ body: |
     ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96_align2 = COPY $vgpr2
     ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]].sub0_sub1
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     undef %0.sub0_sub1:vreg_96_align2 = COPY $vgpr0_vgpr1
     %0.sub2:vreg_96_align2 = COPY $vgpr2
     undef %2.sub0_sub1:areg_96_align2 = COPY %0.sub0_sub1
     %2.sub2:areg_96_align2 = COPY %0.sub2
-    INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2
+    INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2
     SI_RETURN
 
 ...
@@ -1251,13 +1251,13 @@ body: |
     ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
     ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]].sub0_sub1
     ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
     %0.sub2:vreg_96 = COPY $vgpr2
     undef %2.sub0_sub1:areg_96_align2 = COPY %0.sub0_sub1
     %2.sub2:areg_96_align2 = COPY %0.sub2
-    INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2
+    INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2
     SI_RETURN
 
 ...
@@ -1274,11 +1274,11 @@ body: |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_64 = COPY [[COPY]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     %0:vreg_64 = COPY $vgpr0_vgpr1
     %2:areg_64 = COPY %0
-    INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64 */, killed %2
+    INLINEASM &"; use $0", 0 /* attdialect */, 3407881 /* reguse:AReg_64 */, killed %2
    SI_RETURN
 
 ...
@@ -1295,11 +1295,11 @@ body: |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_64_align2 = COPY [[COPY]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     %0:vreg_64_align2 = COPY $vgpr0_vgpr1
     %2:areg_64_align2 = COPY %0
-    INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2
+    INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2
     SI_RETURN
 
 ...
@@ -1316,11 +1316,11 @@ body: |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_96 = COPY $vgpr0_vgpr1_vgpr2
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_96 = COPY [[COPY]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     %0:vreg_96 = COPY $vgpr0_vgpr1_vgpr2
     %3:areg_96 = COPY %0
-    INLINEASM &"; use $0", 0 /* attdialect */, 6094857 /* reguse:AReg_96 */, %3
+    INLINEASM &"; use $0", 0 /* attdialect */, 5505033 /* reguse:AReg_96 */, %3
     SI_RETURN
 
 ...
@@ -1337,11 +1337,11 @@ body: |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_96_align2 = COPY $vgpr0_vgpr1_vgpr2
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_96_align2 = COPY [[COPY]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     %0:vreg_96_align2 = COPY $vgpr0_vgpr1_vgpr2
     %3:areg_96_align2 = COPY %0
-    INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_96_Align2 */, %3
+    INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96_Align2 */, %3
     SI_RETURN
 
 ...
@@ -1358,11 +1358,11 @@ body: |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_128 = COPY [[COPY]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     %0:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
     %2:areg_128 = COPY %0
-    INLINEASM &"; use $0", 0 /* attdialect */, 8519689 /* reguse:AReg_128 */, killed %2
+    INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, killed %2
     SI_RETURN
 
 ...
@@ -1379,11 +1379,11 @@ body: |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[COPY]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     %0:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
     %2:areg_128_align2 = COPY %0
-    INLINEASM &"; use $0", 0 /* attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %2
+    INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2
     SI_RETURN
 
 ...
@@ -1400,11 +1400,11 @@ body: |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr8_sgpr9
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_64_align2 = COPY [[COPY]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     %0:sreg_64 = COPY $sgpr8_sgpr9
     %2:areg_64_align2 = COPY %0
-    INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2
+    INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2
     SI_RETURN
 
 ...
@@ -1421,11 +1421,11 @@ body: |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_96 = COPY $vgpr0_vgpr1_vgpr2
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_96_align2 = COPY [[COPY]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     %0:vreg_96 = COPY $vgpr0_vgpr1_vgpr2
     %2:areg_96_align2 = COPY %0
-    INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %2
+    INLINEASM &"; use $0", 0 /* attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %2
     SI_RETURN
 
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir b/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir
index 126cbc643accf..856d1e66fee9d 100644
--- a/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir
+++ b/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir
@@ -20,10 +20,10 @@ body: |
     ; CHECK-LABEL: name: foo1
     ; CHECK: liveins: $vgpr0_vgpr1
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def undef %2.sub0, 1835019 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1
+    ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def undef %2.sub0, 1245195 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1
     ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
     ; CHECK-NEXT: S_ENDPGM 0
-    INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %0:vgpr_32, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32
+    INLINEASM &"", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %0:vgpr_32, 1245195 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32
     undef %2.sub0:vreg_64 = COPY killed %0
     %2.sub1:vreg_64 = COPY killed %1
     FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
@@ -41,10 +41,10 @@ body: |
     ; CHECK-LABEL: name: foo2
     ; CHECK: liveins: $vgpr0_vgpr1
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1, 1835018 /* regdef:VGPR_32 */, def undef %2.sub0
+    ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1245195 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1, 1245194 /* regdef:VGPR_32 */, def undef %2.sub0
     ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
     ; CHECK-NEXT: S_ENDPGM 0
-    INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32, 1835018 /* regdef:VGPR_32 */, def %0:vgpr_32
+    INLINEASM &"", 0 /* attdialect */, 1245195 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32, 1245194 /* regdef:VGPR_32 */, def %0:vgpr_32
     undef %2.sub0:vreg_64 = COPY killed %0
     %2.sub1:vreg_64 = COPY killed %1
     FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
@@ -62,10 +62,10 @@ body: |
     ; CHECK-LABEL: name: foo3
     ; CHECK: liveins: $vgpr0_vgpr1
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def undef %2.sub0, 1835019 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1
+    ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def undef %2.sub0, 1245195 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1
     ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
     ; CHECK-NEXT: S_ENDPGM 0
-    INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1:vgpr_32, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32
+    INLINEASM &"", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %1:vgpr_32, 1245195 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32
     undef %2.sub0:vreg_64 = COPY killed %1
     %2.sub1:vreg_64 = COPY killed %0
     FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
@@ -83,10 +83,10 @@ body: |
     ; CHECK-LABEL: name: foo4
     ; CHECK: liveins: $vgpr0_vgpr1
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1, 1835018 /* regdef:VGPR_32 */, def undef %2.sub0
INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1, 1835018 /* regdef:VGPR_32 */, def undef %2.sub0 + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1245195 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1, 1245194 /* regdef:VGPR_32 */, def undef %2.sub0 ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) ; CHECK-NEXT: S_ENDPGM 0 - INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32, 1835018 /* regdef:VGPR_32 */, def %1:vgpr_32 + INLINEASM &"", 0 /* attdialect */, 1245195 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32, 1245194 /* regdef:VGPR_32 */, def %1:vgpr_32 undef %2.sub0:vreg_64 = COPY killed %1 %2.sub1:vreg_64 = COPY killed %0 FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll index 613fdf388c0f1..0f45e99dd76c4 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll @@ -64,13 +64,11 @@ define <2 x i32> @mullohi_2xu32(<2 x i32> %arg, <2 x i32> %arg1, ptr %arg2) { ; CHECK-LABEL: mullohi_2xu32: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v6, v1 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v1, 0 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v6, 0 -; CHECK-NEXT: v_mov_b32_e32 v6, v1 -; CHECK-NEXT: v_mov_b32_e32 v7, v3 -; CHECK-NEXT: v_mov_b32_e32 v1, v2 -; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[6:7] +; CHECK-NEXT: v_mov_b32_e32 v2, v7 +; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[1:2] +; CHECK-NEXT: v_mov_b32_e32 v1, v6 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] bb: diff --git a/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll b/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll index 54cbc25043db3..e841ec43fd064 100644 --- a/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll +++ b/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll @@ -193,14 +193,13 @@ define noundef i64 @urem64_3(i64 noundef %i) { ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[2:3] ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v3 ; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s6, v[2:3] -; GFX9-NEXT: v_alignbit_b32 v2, v3, v2, 1 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, 3, 0 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, 3, v[2:3] -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, s6, v[2:3] +; GFX9-NEXT: v_alignbit_b32 v2, v4, v3, 1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, 3, 0 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, 3, v[3:4] +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: urem64_3: @@ -238,14 +237,13 @@ define noundef i64 @urem64_3(i64 noundef %i) { ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v0, v[2:3] ; GFX1030-NEXT: v_add_co_u32 v2, s4, v5, v3 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s4 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v1, v[2:3] -; 
-; GFX1030-NEXT:    v_lshrrev_b32_e32 v3, 1, v3
-; GFX1030-NEXT:    v_mad_u64_u32 v[4:5], null, v2, 3, 0
-; GFX1030-NEXT:    v_mov_b32_e32 v2, v5
-; GFX1030-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v4
-; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, v3, 3, v[2:3]
-; GFX1030-NEXT:    v_sub_co_ci_u32_e64 v1, null, v1, v2, vcc_lo
+; GFX1030-NEXT:    v_mad_u64_u32 v[3:4], null, 0xaaaaaaaa, v1, v[2:3]
+; GFX1030-NEXT:    v_alignbit_b32 v2, v4, v3, 1
+; GFX1030-NEXT:    v_lshrrev_b32_e32 v4, 1, v4
+; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, v2, 3, 0
+; GFX1030-NEXT:    v_mad_u64_u32 v[3:4], null, v4, 3, v[3:4]
+; GFX1030-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
+; GFX1030-NEXT:    v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
 ; GFX1030-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %rem = urem i64 %i, 3
@@ -265,14 +263,13 @@ define noundef i64 @urem64_6(i64 noundef %i) {
 ; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[2:3]
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v5, v3
 ; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s6, v[2:3]
-; GFX9-NEXT:    v_alignbit_b32 v2, v3, v2, 2
-; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v2, 6, 0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
-; GFX9-NEXT:    v_mov_b32_e32 v2, v5
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v3, 6, v[2:3]
-; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v1, s6, v[2:3]
+; GFX9-NEXT:    v_alignbit_b32 v2, v4, v3, 2
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v2, 6, 0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 2, v4
+; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v4, 6, v[3:4]
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: urem64_6:
@@ -310,14 +307,13 @@ define noundef i64 @urem64_6(i64 noundef %i) {
 ; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v0, v[2:3]
 ; GFX1030-NEXT:    v_add_co_u32 v2, s4, v5, v3
 ; GFX1030-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, 0, s4
-; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v1, v[2:3]
-; GFX1030-NEXT:    v_alignbit_b32 v2, v3, v2, 2
-; GFX1030-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
-; GFX1030-NEXT:    v_mad_u64_u32 v[4:5], null, v2, 6, 0
-; GFX1030-NEXT:    v_mov_b32_e32 v2, v5
-; GFX1030-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v4
-; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, v3, 6, v[2:3]
-; GFX1030-NEXT:    v_sub_co_ci_u32_e64 v1, null, v1, v2, vcc_lo
+; GFX1030-NEXT:    v_mad_u64_u32 v[3:4], null, 0xaaaaaaaa, v1, v[2:3]
+; GFX1030-NEXT:    v_alignbit_b32 v2, v4, v3, 2
+; GFX1030-NEXT:    v_lshrrev_b32_e32 v4, 2, v4
+; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, v2, 6, 0
+; GFX1030-NEXT:    v_mad_u64_u32 v[3:4], null, v4, 6, v[3:4]
+; GFX1030-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
+; GFX1030-NEXT:    v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
 ; GFX1030-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %rem = urem i64 %i, 6
diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
index 26f77898faf60..ddac86b3719c2 100644
--- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
@@ -1953,68 +1953,66 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; SDAG-NEXT:    v_mul_lo_u32 v14, v33, v3
 ; SDAG-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v33, v2, 0
-; SDAG-NEXT:    v_mul_lo_u32 v24, v27, v2
-; SDAG-NEXT:    v_mul_lo_u32 v25, v34, v31
-; SDAG-NEXT:    v_mul_lo_u32 v34, v32, v30
-; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v31, v33, 0
-; SDAG-NEXT:    v_mov_b32_e32 v15, 0
-; SDAG-NEXT:    v_mul_lo_u32 v38, v12, v7
-; SDAG-NEXT:    v_mad_u64_u32 v[20:21], s[4:5], v12, v6, 0
-; SDAG-NEXT:    v_mul_lo_u32 v39, v13, v6
+; SDAG-NEXT:    v_mul_lo_u32 v15, v27, v2
+; SDAG-NEXT:    v_mul_lo_u32 v23, v34, v31
+; SDAG-NEXT:    v_mul_lo_u32 v24, v32, v30
+; SDAG-NEXT:    v_mad_u64_u32 v[20:21], s[4:5], v31, v33, 0
+; SDAG-NEXT:    v_mov_b32_e32 v22, 0
+; SDAG-NEXT:    v_mul_lo_u32 v25, v12, v7
+; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v12, v6, 0
+; SDAG-NEXT:    v_mul_lo_u32 v34, v13, v6
 ; SDAG-NEXT:    v_mul_lo_u32 v19, v19, v37
-; SDAG-NEXT:    v_mul_lo_u32 v48, v18, v36
-; SDAG-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v37, v12, 0
+; SDAG-NEXT:    v_mul_lo_u32 v38, v18, v36
 ; SDAG-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
-; SDAG-NEXT:    v_mov_b32_e32 v14, v3
-; SDAG-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v30, v33, v[14:15]
-; SDAG-NEXT:    v_sub_i32_e32 v16, vcc, v16, v2
-; SDAG-NEXT:    v_add_i32_e64 v21, s[4:5], v21, v38
-; SDAG-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v24
-; SDAG-NEXT:    v_mov_b32_e32 v14, v22
-; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v31, v27, v[14:15]
-; SDAG-NEXT:    v_xor_b32_e32 v24, v16, v28
-; SDAG-NEXT:    v_add_i32_e64 v21, s[4:5], v21, v39
+; SDAG-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v30, v33, v[21:22]
+; SDAG-NEXT:    v_sub_i32_e32 v16, vcc, v16, v20
+; SDAG-NEXT:    v_add_i32_e64 v3, s[4:5], v3, v25
+; SDAG-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v15
+; SDAG-NEXT:    v_mov_b32_e32 v21, v6
+; SDAG-NEXT:    v_mad_u64_u32 v[14:15], s[4:5], v31, v27, v[21:22]
+; SDAG-NEXT:    v_xor_b32_e32 v16, v16, v28
+; SDAG-NEXT:    v_add_i32_e64 v3, s[4:5], v3, v34
 ; SDAG-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v32, v31, v[10:11]
-; SDAG-NEXT:    v_add_i32_e64 v22, s[4:5], v23, v3
-; SDAG-NEXT:    v_addc_u32_e64 v23, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT:    v_subb_u32_e32 v31, vcc, v17, v2, vcc
-; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v18, v37, v[20:21]
-; SDAG-NEXT:    v_mov_b32_e32 v14, v7
-; SDAG-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v36, v12, v[14:15]
-; SDAG-NEXT:    v_add_i32_e64 v7, s[4:5], v25, v11
-; SDAG-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v30, v27, v[22:23]
-; SDAG-NEXT:    v_xor_b32_e32 v18, v31, v29
+; SDAG-NEXT:    v_add_i32_e64 v6, s[4:5], v7, v15
+; SDAG-NEXT:    v_addc_u32_e64 v7, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT:    v_subb_u32_e32 v14, vcc, v17, v14, vcc
+; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v18, v37, v[2:3]
+; SDAG-NEXT:    v_mad_u64_u32 v[20:21], s[4:5], v37, v12, 0
+; SDAG-NEXT:    v_add_i32_e64 v15, s[4:5], v23, v11
+; SDAG-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v30, v27, v[6:7]
+; SDAG-NEXT:    v_xor_b32_e32 v17, v14, v29
 ; SDAG-NEXT:    v_add_i32_e64 v3, s[4:5], v19, v3
-; SDAG-NEXT:    v_mov_b32_e32 v14, v16
-; SDAG-NEXT:    v_mad_u64_u32 v[14:15], s[4:5], v37, v13, v[14:15]
-; SDAG-NEXT:    v_add_i32_e64 v7, s[4:5], v34, v7
-; SDAG-NEXT:    v_add_i32_e64 v3, s[4:5], v48, v3
-; SDAG-NEXT:    v_add_i32_e64 v15, s[4:5], v17, v15
-; SDAG-NEXT:    v_addc_u32_e64 v16, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
-; SDAG-NEXT:    v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5]
-; SDAG-NEXT:    v_subb_u32_e32 v0, vcc, v0, v10, vcc
-; SDAG-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v36, v13, v[15:16]
-; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, v1, v7, vcc
-; SDAG-NEXT:    v_xor_b32_e32 v7, v0, v28
-; SDAG-NEXT:    v_add_i32_e32 v10, vcc, v10, v2
-; SDAG-NEXT:    v_addc_u32_e32 v11, vcc, v11, v3, vcc
-; SDAG-NEXT:    v_xor_b32_e32 v3, v1, v29
-; SDAG-NEXT:    v_sub_i32_e32 v0, vcc, v24, v28
-; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, v18, v29, vcc
-; SDAG-NEXT:    v_subb_u32_e32 v2, vcc, v7, v28, vcc
-; SDAG-NEXT:    v_subb_u32_e32 v3, vcc, v3, v29, vcc
-; SDAG-NEXT:    v_sub_i32_e32 v6, vcc, v8, v6
-; SDAG-NEXT:    v_subb_u32_e32 v7, vcc, v9, v14, vcc
-; SDAG-NEXT:    v_xor_b32_e32 v6, v6, v26
-; SDAG-NEXT:    v_subb_u32_e32 v4, vcc, v4, v10, vcc
-; SDAG-NEXT:    v_xor_b32_e32 v7, v7, v35
-; SDAG-NEXT:    v_subb_u32_e32 v5, vcc, v5, v11, vcc
-; SDAG-NEXT:    v_xor_b32_e32 v8, v4, v26
+; SDAG-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v36, v12, v[21:22]
+; SDAG-NEXT:    v_add_i32_e64 v18, s[4:5], v24, v15
+; SDAG-NEXT:    v_add_i32_e64 v3, s[4:5], v38, v3
+; SDAG-NEXT:    v_mov_b32_e32 v21, v11
+; SDAG-NEXT:    v_mad_u64_u32 v[14:15], s[4:5], v37, v13, v[21:22]
+; SDAG-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v10
+; SDAG-NEXT:    v_addc_u32_e64 v10, s[4:5], v7, v18, s[4:5]
+; SDAG-NEXT:    v_subb_u32_e32 v0, vcc, v0, v6, vcc
+; SDAG-NEXT:    v_add_i32_e64 v6, s[4:5], v12, v15
+; SDAG-NEXT:    v_addc_u32_e64 v7, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, v1, v10, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v10, v0, v28
+; SDAG-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v36, v13, v[6:7]
+; SDAG-NEXT:    v_xor_b32_e32 v11, v1, v29
+; SDAG-NEXT:    v_sub_i32_e32 v0, vcc, v16, v28
+; SDAG-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v2
+; SDAG-NEXT:    v_addc_u32_e64 v7, s[4:5], v7, v3, s[4:5]
+; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, v17, v29, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v2, vcc, v10, v28, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v3, vcc, v11, v29, vcc
+; SDAG-NEXT:    v_sub_i32_e32 v8, vcc, v8, v20
+; SDAG-NEXT:    v_subb_u32_e32 v9, vcc, v9, v14, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v8, v8, v26
+; SDAG-NEXT:    v_subb_u32_e32 v4, vcc, v4, v6, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v6, v9, v35
+; SDAG-NEXT:    v_subb_u32_e32 v5, vcc, v5, v7, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v7, v4, v26
 ; SDAG-NEXT:    v_xor_b32_e32 v9, v5, v35
-; SDAG-NEXT:    v_sub_i32_e32 v4, vcc, v6, v26
-; SDAG-NEXT:    v_subb_u32_e32 v5, vcc, v7, v35, vcc
-; SDAG-NEXT:    v_subb_u32_e32 v6, vcc, v8, v26, vcc
+; SDAG-NEXT:    v_sub_i32_e32 v4, vcc, v8, v26
+; SDAG-NEXT:    v_subb_u32_e32 v5, vcc, v6, v35, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v6, vcc, v7, v26, vcc
 ; SDAG-NEXT:    v_subb_u32_e32 v7, vcc, v9, v35, vcc
 ; SDAG-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -2407,44 +2405,41 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
 ; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v30, v31, 0
 ; GISEL-NEXT:    v_mad_u64_u32 v[14:15], s[4:5], v30, v18, 0
-; GISEL-NEXT:    v_mul_lo_u32 v27, v30, v19
-; GISEL-NEXT:    v_mul_lo_u32 v36, v29, v18
-; GISEL-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v35, v20, 0
-; GISEL-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v35, v2, 0
-; GISEL-NEXT:    v_mul_lo_u32 v37, v35, v3
-; GISEL-NEXT:    v_mul_lo_u32 v38, v34, v2
-; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v29, v32, v[14:15]
-; GISEL-NEXT:    v_mad_u64_u32 v[14:15], s[4:5], v34, v21, v[22:23]
-; GISEL-NEXT:    v_mov_b32_e32 v22, v19
-; GISEL-NEXT:    v_mad_u64_u32 v[23:24], s[4:5], v10, v31, v[2:3]
-; GISEL-NEXT:    v_mad_u64_u32 v[25:26], s[4:5], v4, v20, v[14:15]
-; GISEL-NEXT:    v_mov_b32_e32 v2, v23
+; GISEL-NEXT:    v_mul_lo_u32 v26, v30, v19
+; GISEL-NEXT:    v_mul_lo_u32 v27, v29, v18
+; GISEL-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v35, v20, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v35, v2, 0
+; GISEL-NEXT:    v_mul_lo_u32 v36, v35, v3
+; GISEL-NEXT:    v_mul_lo_u32 v37, v34, v2
+; GISEL-NEXT:    v_mad_u64_u32 v[24:25], s[4:5], v29, v32, v[14:15]
+; GISEL-NEXT:    v_mad_u64_u32 v[14:15], s[4:5], v34, v21, v[18:19]
+; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v10, v31, v[24:25]
+; GISEL-NEXT:    v_mad_u64_u32 v[24:25], s[4:5], v4, v20, v[14:15]
 ; GISEL-NEXT:    v_mad_u64_u32 v[14:15], vcc, v30, v32, v[1:2]
-; GISEL-NEXT:    v_mov_b32_e32 v23, v25
-; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v35, v21, v[22:23]
-; GISEL-NEXT:    v_mad_u64_u32 v[22:23], s[6:7], v29, v31, v[14:15]
-; GISEL-NEXT:    v_addc_u32_e64 v3, s[6:7], v24, v27, s[6:7]
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v35, v21, v[23:24]
+; GISEL-NEXT:    v_mad_u64_u32 v[18:19], s[6:7], v29, v31, v[14:15]
+; GISEL-NEXT:    v_addc_u32_e64 v3, s[6:7], v3, v26, s[6:7]
 ; GISEL-NEXT:    v_mad_u64_u32 v[14:15], s[6:7], v34, v20, v[1:2]
-; GISEL-NEXT:    v_addc_u32_e64 v2, s[6:7], v26, v37, s[6:7]
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v3, v36, vcc
+; GISEL-NEXT:    v_addc_u32_e64 v2, s[6:7], v25, v36, s[6:7]
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v3, v27, vcc
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v16, v0
-; GISEL-NEXT:    v_subb_u32_e32 v16, vcc, v17, v22, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v19, v0, v28
-; GISEL-NEXT:    v_addc_u32_e64 v0, s[4:5], v2, v38, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v12, v18
+; GISEL-NEXT:    v_subb_u32_e32 v16, vcc, v17, v18, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v18, v0, v28
+; GISEL-NEXT:    v_addc_u32_e64 v0, s[4:5], v2, v37, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v12, v22
 ; GISEL-NEXT:    v_subb_u32_e64 v14, s[4:5], v13, v14, s[4:5]
-; GISEL-NEXT:    v_xor_b32_e32 v18, v2, v33
+; GISEL-NEXT:    v_xor_b32_e32 v22, v2, v33
 ; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[6:7], v10, v32, v[1:2]
 ; GISEL-NEXT:    v_xor_b32_e32 v1, v16, v28
 ; GISEL-NEXT:    v_mad_u64_u32 v[12:13], s[6:7], v4, v21, v[0:1]
 ; GISEL-NEXT:    v_xor_b32_e32 v10, v14, v33
 ; GISEL-NEXT:    v_mad_u64_u32 v[16:17], s[6:7], v11, v31, v[2:3]
-; GISEL-NEXT:    v_sub_i32_e64 v0, s[6:7], v19, v28
+; GISEL-NEXT:    v_sub_i32_e64 v0, s[6:7], v18, v28
 ; GISEL-NEXT:    v_subb_u32_e64 v1, s[6:7], v1, v28, s[6:7]
 ; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[8:9], v5, v20, v[12:13]
-; GISEL-NEXT:    v_sub_i32_e64 v4, s[8:9], v18, v33
+; GISEL-NEXT:    v_sub_i32_e64 v4, s[8:9], v22, v33
 ; GISEL-NEXT:    v_subb_u32_e64 v5, s[8:9], v10, v33, s[8:9]
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v8, v23, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v8, v19, vcc
 ; GISEL-NEXT:    v_subb_u32_e32 v8, vcc, v9, v16, vcc
 ; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v28
 ; GISEL-NEXT:    v_subb_u32_e64 v6, vcc, v6, v15, s[4:5]
@@ -2815,52 +2810,50 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_or_b32_e32 v16, v16, v20
 ; SDAG-NEXT:  .LBB3_12: ; %Flow12
 ; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
-; SDAG-NEXT:    v_mul_lo_u32 v20, v32, v11
-; SDAG-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v32, v10, 0
-; SDAG-NEXT:    v_mul_lo_u32 v28, v30, v10
-; SDAG-NEXT:    v_mul_lo_u32 v29, v33, v8
-; SDAG-NEXT:    v_mul_lo_u32 v33, v31, v9
-; SDAG-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v8, v32, 0
-; SDAG-NEXT:    v_mov_b32_e32 v21, 0
-; SDAG-NEXT:    v_mul_lo_u32 v34, v16, v15
-; SDAG-NEXT:    v_mad_u64_u32 v[24:25], s[4:5], v16, v14, 0
-; SDAG-NEXT:    v_mul_lo_u32 v35, v17, v14
-; SDAG-NEXT:    v_mul_lo_u32 v23, v23, v12
-; SDAG-NEXT:    v_mul_lo_u32 v36, v22, v13
-; SDAG-NEXT:    v_mad_u64_u32 v[14:15], s[4:5], v12, v16, 0
-; SDAG-NEXT:    v_add_i32_e32 v19, vcc, v19, v20
-; SDAG-NEXT:    v_mov_b32_e32 v20, v11
-; SDAG-NEXT:    v_mad_u64_u32 v[26:27], s[4:5], v9, v32, v[20:21]
-; SDAG-NEXT:    v_sub_i32_e32 v0, vcc, v0, v10
-; SDAG-NEXT:    v_add_i32_e64 v25, s[4:5], v25, v34
-; SDAG-NEXT:    v_add_i32_e64 v19, s[4:5], v19, v28
-; SDAG-NEXT:    v_mov_b32_e32 v20, v26
-; SDAG-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v8, v30, v[20:21]
-; SDAG-NEXT:    v_add_i32_e64 v25, s[4:5], v25, v35
-; SDAG-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v31, v8, v[18:19]
-; SDAG-NEXT:    v_add_i32_e64 v26, s[4:5], v27, v11
-; SDAG-NEXT:    v_addc_u32_e64 v27, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, v1, v10, vcc
-; SDAG-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v22, v12, v[24:25]
-; SDAG-NEXT:    v_mov_b32_e32 v20, v15
-; SDAG-NEXT:    v_mad_u64_u32 v[15:16], s[4:5], v13, v16, v[20:21]
-; SDAG-NEXT:    v_add_i32_e64 v19, s[4:5], v29, v19
-; SDAG-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v9, v30, v[26:27]
-; SDAG-NEXT:    v_add_i32_e64 v22, s[4:5], v23, v11
-; SDAG-NEXT:    v_mov_b32_e32 v20, v15
-; SDAG-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v12, v17, v[20:21]
-; SDAG-NEXT:    v_add_i32_e64 v19, s[4:5], v33, v19
-; SDAG-NEXT:    v_add_i32_e64 v20, s[4:5], v36, v22
-; SDAG-NEXT:    v_add_i32_e64 v15, s[4:5], v16, v12
-; SDAG-NEXT:    v_addc_u32_e64 v16, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v18
-; SDAG-NEXT:    v_addc_u32_e64 v12, s[4:5], v9, v19, s[4:5]
+; SDAG-NEXT:    v_mul_lo_u32 v21, v32, v11
+; SDAG-NEXT:    v_mad_u64_u32 v[24:25], s[4:5], v32, v10, 0
+; SDAG-NEXT:    v_mul_lo_u32 v26, v30, v10
+; SDAG-NEXT:    v_mul_lo_u32 v27, v33, v8
+; SDAG-NEXT:    v_mul_lo_u32 v28, v31, v9
+; SDAG-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v8, v32, 0
+; SDAG-NEXT:    v_mov_b32_e32 v20, 0
+; SDAG-NEXT:    v_mul_lo_u32 v29, v16, v15
+; SDAG-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v16, v14, 0
+; SDAG-NEXT:    v_mul_lo_u32 v33, v17, v14
+; SDAG-NEXT:    v_mul_lo_u32 v34, v23, v12
+; SDAG-NEXT:    v_mul_lo_u32 v35, v22, v13
+; SDAG-NEXT:    v_add_i32_e32 v21, vcc, v25, v21
+; SDAG-NEXT:    v_mad_u64_u32 v[14:15], s[4:5], v9, v32, v[19:20]
+; SDAG-NEXT:    v_sub_i32_e32 v0, vcc, v0, v18
+; SDAG-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v29
+; SDAG-NEXT:    v_add_i32_e64 v25, s[4:5], v21, v26
+; SDAG-NEXT:    v_mov_b32_e32 v19, v14
+; SDAG-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v8, v30, v[19:20]
+; SDAG-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v33
+; SDAG-NEXT:    v_mad_u64_u32 v[23:24], s[4:5], v31, v8, v[24:25]
+; SDAG-NEXT:    v_add_i32_e64 v14, s[4:5], v15, v19
+; SDAG-NEXT:    v_addc_u32_e64 v15, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, v1, v18, vcc
+; SDAG-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v22, v12, v[10:11]
+; SDAG-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v12, v16, 0
+; SDAG-NEXT:    v_add_i32_e64 v21, s[4:5], v27, v24
+; SDAG-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v9, v30, v[14:15]
+; SDAG-NEXT:    v_add_i32_e64 v11, s[4:5], v34, v11
+; SDAG-NEXT:    v_mad_u64_u32 v[14:15], s[4:5], v13, v16, v[19:20]
+; SDAG-NEXT:    v_add_i32_e64 v16, s[4:5], v28, v21
+; SDAG-NEXT:    v_add_i32_e64 v21, s[4:5], v35, v11
+; SDAG-NEXT:    v_mov_b32_e32 v19, v14
+; SDAG-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v12, v17, v[19:20]
+; SDAG-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v23
+; SDAG-NEXT:    v_addc_u32_e64 v14, s[4:5], v9, v16, s[4:5]
 ; SDAG-NEXT:    v_subb_u32_e32 v2, vcc, v2, v8, vcc
-; SDAG-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v13, v17, v[15:16]
-; SDAG-NEXT:    v_subb_u32_e32 v3, vcc, v3, v12, vcc
+; SDAG-NEXT:    v_add_i32_e64 v8, s[4:5], v15, v12
+; SDAG-NEXT:    v_addc_u32_e64 v9, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT:    v_subb_u32_e32 v3, vcc, v3, v14, vcc
+; SDAG-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v13, v17, v[8:9]
 ; SDAG-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; SDAG-NEXT:    v_addc_u32_e32 v9, vcc, v9, v20, vcc
-; SDAG-NEXT:    v_sub_i32_e32 v4, vcc, v4, v14
+; SDAG-NEXT:    v_addc_u32_e32 v9, vcc, v9, v21, vcc
+; SDAG-NEXT:    v_sub_i32_e32 v4, vcc, v4, v18
 ; SDAG-NEXT:    v_subb_u32_e32 v5, vcc, v5, v11, vcc
 ; SDAG-NEXT:    v_subb_u32_e32 v6, vcc, v6, v8, vcc
 ; SDAG-NEXT:    v_subb_u32_e32 v7, vcc, v7, v9, vcc
@@ -3223,18 +3216,16 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v12, v18, 0 ; GISEL-NEXT: v_mul_lo_u32 v36, v12, v19 ; GISEL-NEXT: v_mul_lo_u32 v37, v13, v18 -; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v33, v[22:23] -; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v13, v25, v[26:27] -; GISEL-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v10, v32, v[18:19] -; GISEL-NEXT: v_mad_u64_u32 v[28:29], s[4:5], v14, v24, v[22:23] -; GISEL-NEXT: v_mov_b32_e32 v18, v26 -; GISEL-NEXT: v_mad_u64_u32 v[30:31], vcc, v8, v33, v[17:18] -; GISEL-NEXT: v_mov_b32_e32 v22, v28 +; GISEL-NEXT: v_mad_u64_u32 v[28:29], s[4:5], v9, v33, v[22:23] +; GISEL-NEXT: v_mad_u64_u32 v[30:31], s[4:5], v13, v25, v[26:27] +; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v10, v32, v[28:29] +; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v14, v24, v[30:31] +; GISEL-NEXT: v_mad_u64_u32 v[26:27], vcc, v8, v33, v[17:18] ; GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v12, v25, v[21:22] -; GISEL-NEXT: v_mad_u64_u32 v[21:22], s[6:7], v9, v32, v[30:31] -; GISEL-NEXT: v_addc_u32_e64 v12, s[6:7], v27, v34, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[21:22], s[6:7], v9, v32, v[26:27] +; GISEL-NEXT: v_addc_u32_e64 v12, s[6:7], v19, v34, s[6:7] ; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[6:7], v13, v24, v[17:18] -; GISEL-NEXT: v_addc_u32_e64 v13, s[6:7], v29, v36, s[6:7] +; GISEL-NEXT: v_addc_u32_e64 v13, s[6:7], v23, v36, s[6:7] ; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v35, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v21, vcc diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll index 08545b901581c..8532a7f716ba7 100644 --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll @@ -487,13 +487,13 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32 ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s3, s3, 16 +; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 16 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_lshr_b32 s0, s3, 16 ; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_alignbit_b32 v0, s0, v0, 16 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir index 4dfdb56a69ff3..98472552d2bf1 100644 --- a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir @@ -370,7 +370,7 @@ body: | ; HAZARD-LABEL: name: inline_sdwa_hazard ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; HAZARD-NEXT: {{ $}} - ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 ; HAZARD-NEXT: S_NOP 0 ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) ; HAZARD-NEXT: S_ENDPGM 0 @@ -378,10 +378,10 @@ body: | ; NOHAZARD-LABEL: name: inline_sdwa_hazard ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; NOHAZARD-NEXT: {{ $}} - ; 
NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) ; NOHAZARD-NEXT: S_ENDPGM 0 - INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) S_ENDPGM 0 ... @@ -397,17 +397,17 @@ body: | ; HAZARD-NEXT: {{ $}} ; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) ; HAZARD-NEXT: S_NOP 0 - ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 ; HAZARD-NEXT: S_ENDPGM 0 ; ; NOHAZARD-LABEL: name: sdwa_inline_hazard ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; NOHAZARD-NEXT: {{ $}} ; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) - ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 ; NOHAZARD-NEXT: S_ENDPGM 0 renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0) - INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 S_ENDPGM 0 ... 
@@ -421,19 +421,19 @@ body: | ; HAZARD-LABEL: name: inline_inline_hazard ; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; HAZARD-NEXT: {{ $}} - ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 ; HAZARD-NEXT: S_NOP 0 - ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 ; HAZARD-NEXT: S_ENDPGM 0 ; ; NOHAZARD-LABEL: name: inline_inline_hazard ; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode ; NOHAZARD-NEXT: {{ $}} - ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 - ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 + ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 ; NOHAZARD-NEXT: S_ENDPGM 0 - INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 - INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0, 1835017 /* reguse:VGPR_32 */, $vgpr1 + INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 + INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0, 1245193 /* reguse:VGPR_32 */, $vgpr1 S_ENDPGM 0 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index 30bcdf97e26fd..4ff8bf23638f1 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -5023,20 +5023,20 @@ define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2f32_sign_v2bf16(<2 x float> in ; GFX8-NEXT: s_add_i32 s6, s4, 0x7fff ; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], s0, s0 ; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX8-NEXT: s_cselect_b32 s3, s3, s6 -; GFX8-NEXT: s_bfe_u32 s0, s1, 0x10010 -; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_or_b32 s4, s1, 0x400000 -; GFX8-NEXT: s_add_i32 s5, s0, 0x7fff -; GFX8-NEXT: v_cmp_u_f32_e64 s[0:1], s1, s1 -; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX8-NEXT: s_cselect_b32 s0, s4, s5 -; GFX8-NEXT: s_lshr_b32 s0, s0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, 16 -; GFX8-NEXT: s_mov_b32 s0, 0x7fff7fff +; GFX8-NEXT: s_cselect_b32 s0, s3, s6 +; GFX8-NEXT: s_bfe_u32 s4, s1, 0x10010 +; GFX8-NEXT: s_add_i32 s4, s4, s1 +; GFX8-NEXT: s_or_b32 s3, s1, 0x400000 +; GFX8-NEXT: s_add_i32 s6, s4, 0x7fff +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], s1, s1 +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s1, s3, s6 +; GFX8-NEXT: s_lshr_b32 s1, s1, 16 +; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 16 +; GFX8-NEXT: s_mov_b32 s1, 0x7fff7fff +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v1 +; GFX8-NEXT: v_bfi_b32 v0, s1, v0, v1 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -5185,29 +5185,29 @@ define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2f64_sign_v2bf16(<2 x double> i ; GFX8-NEXT: s_addk_i32 s8, 0x7fff ; GFX8-NEXT: s_bitset1_b32 s5, 22 ; GFX8-NEXT: s_and_b64 s[6:7], s[6:7], exec -; GFX8-NEXT: s_cselect_b32 s5, s5, s8 -; GFX8-NEXT: v_readfirstlane_b32 s8, v3 -; GFX8-NEXT: s_bitcmp1_b32 s8, 0 -; GFX8-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_and_b64 s[6:7], s[12:13], exec +; GFX8-NEXT: s_cselect_b32 s6, s5, s8 +; GFX8-NEXT: v_readfirstlane_b32 s5, v3 +; GFX8-NEXT: s_bitcmp1_b32 s5, 0 +; GFX8-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GFX8-NEXT: s_and_b64 s[8:9], s[12:13], exec ; GFX8-NEXT: v_cmp_u_f64_e64 s[2:3], s[2:3], s[2:3] -; GFX8-NEXT: s_cselect_b32 s6, 1, -1 -; GFX8-NEXT: s_add_i32 s6, s8, s6 +; GFX8-NEXT: s_cselect_b32 s7, 1, -1 +; GFX8-NEXT: s_add_i32 s7, s5, s7 ; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX8-NEXT: s_cselect_b32 s0, s8, s6 +; GFX8-NEXT: s_cselect_b32 s0, s5, s7 ; GFX8-NEXT: s_bfe_u32 s1, s0, 0x10010 ; GFX8-NEXT: s_add_i32 s1, s1, s0 -; GFX8-NEXT: s_add_i32 s6, s1, 0x7fff +; GFX8-NEXT: s_add_i32 s5, s1, 0x7fff ; GFX8-NEXT: s_or_b32 s7, s0, 0x400000 ; GFX8-NEXT: s_and_b64 s[0:1], s[2:3], exec -; GFX8-NEXT: s_cselect_b32 s0, s7, s6 -; GFX8-NEXT: s_lshr_b32 s0, s0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s5 -; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, 16 -; GFX8-NEXT: s_mov_b32 s0, 0x7fff7fff +; GFX8-NEXT: s_cselect_b32 s0, s7, s5 +; GFX8-NEXT: s_lshr_b32 s7, s0, 16 +; GFX8-NEXT: s_lshr_b64 s[0:1], s[6:7], 16 +; GFX8-NEXT: s_mov_b32 s1, 0x7fff7fff +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v1 +; GFX8-NEXT: v_bfi_b32 v0, s1, v0, v1 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -5421,19 +5421,19 @@ define amdgpu_ps i32 
@s_copysign_out_v2bf16_mag_v2bf16_sign_v2f32(<2 x bfloat> i ; GFX8-NEXT: s_addk_i32 s3, 0x7fff ; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], s1, s1 ; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX8-NEXT: s_cselect_b32 s1, s1, s3 -; GFX8-NEXT: s_bfe_u32 s3, s2, 0x10010 -; GFX8-NEXT: s_add_i32 s3, s3, s2 -; GFX8-NEXT: s_addk_i32 s3, 0x7fff -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], s2, s2 -; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX8-NEXT: s_cselect_b32 s2, s2, s3 -; GFX8-NEXT: s_lshr_b32 s2, s2, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: v_alignbit_b32 v0, s2, v0, 16 +; GFX8-NEXT: s_cselect_b32 s4, s1, s3 +; GFX8-NEXT: s_bfe_u32 s1, s2, 0x10010 +; GFX8-NEXT: s_add_i32 s1, s1, s2 +; GFX8-NEXT: s_addk_i32 s1, 0x7fff +; GFX8-NEXT: v_cmp_u_f32_e64 s[6:7], s2, s2 +; GFX8-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GFX8-NEXT: s_cselect_b32 s1, s2, s1 +; GFX8-NEXT: s_lshr_b32 s5, s1, 16 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[4:5], 16 ; GFX8-NEXT: s_mov_b32 s1, 0x7fff7fff -; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: v_bfi_b32 v0, s1, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_bfi_b32 v0, s1, v0, v1 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll index 10c60dfc9b34c..5424ebfcffcd1 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll @@ -409,7 +409,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out, ; CI-NEXT: v_add_f32_e64 v1, s2, 2.0 ; CI-NEXT: v_add_f32_e64 v0, s3, 1.0 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; CI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; CI-NEXT: v_or_b32_e32 v2, 0x80008000, v0 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -441,7 +441,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out, ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; VI-NEXT: v_or_b32_e32 v2, 0x80008000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -709,16 +709,16 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2bf16(ptr addrspace(1) %out, <2 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s3, s2, 0x7fff0000 -; CI-NEXT: s_and_b32 s2, s2, 0x7fff -; CI-NEXT: s_lshl_b32 s2, s2, 16 -; CI-NEXT: v_mul_f32_e64 v0, s3, -4.0 -; CI-NEXT: v_mul_f32_e64 v1, s2, -4.0 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_alignbit_b32 v2, v0, v1, 16 -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_and_b32 s3, s2, 0x7fff +; CI-NEXT: s_and_b32 s2, s2, 0x7fff0000 +; CI-NEXT: v_mul_f32_e64 v0, s2, -4.0 +; CI-NEXT: s_lshl_b32 s2, s3, 16 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; CI-NEXT: v_mul_f32_e64 v0, s2, -4.0 +; CI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; CI-NEXT: v_mov_b32_e32 v2, s1 +; CI-NEXT: v_mov_b32_e32 v1, s0 +; CI-NEXT: flat_store_dword v[1:2], v0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: fold_user_fneg_fabs_v2bf16: @@ -749,10 +749,10 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2bf16(ptr addrspace(1) %out, <2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v2, v1, v0, 16 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: flat_store_dword v[1:2], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fold_user_fneg_fabs_v2bf16: @@ -956,17 +956,17 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2bf16(ptr addrspa ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: s_and_b32 s1, s4, 0x7fff +; CI-NEXT: s_and_b32 s2, s4, 0x7fff0000 +; CI-NEXT: v_mul_f32_e64 v4, s2, -4.0 ; CI-NEXT: s_lshl_b32 s1, s1, 16 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; CI-NEXT: v_mul_f32_e64 v4, s1, -4.0 -; CI-NEXT: s_and_b32 s1, s4, 0x7fff0000 -; CI-NEXT: v_mul_f32_e64 v5, s1, -4.0 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_and_b32 s0, s4, 0x7fff7fff -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; CI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; CI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 ; CI-NEXT: v_mov_b32_e32 v5, s0 -; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_store_dword v[0:1], v5 ; CI-NEXT: flat_store_dword v[2:3], v4 @@ -1000,10 +1000,10 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2bf16(ptr addrspa ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5] ; VI-NEXT: v_mov_b32_e32 v5, s0 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 diff --git a/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll index 84b904ff67151..63aadaacbeb3a 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll @@ -627,18 +627,18 @@ define amdgpu_kernel void @v_fneg_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_load_dword v2, v[0:1] +; CI-NEXT: flat_load_dword v1, v[0:1] ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_xor_b32_e32 v3, 0x8000, v2 -; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_mul_f32_e64 v2, -v2, v2 -; CI-NEXT: v_mul_f32_e32 v3, v3, v4 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; CI-NEXT: v_xor_b32_e32 v2, 0x8000, v1 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_mul_f32_e64 v4, -v1, v1 +; CI-NEXT: v_mul_f32_e32 v1, v2, v3 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; CI-NEXT: v_lshr_b64 v[2:3], v[1:2], 16 +; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -648,34 +648,34 @@ define amdgpu_kernel void @v_fneg_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa ; GFX8-NEXT: s_add_i32 s12, s12, s17 ; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX8-NEXT: v_mov_b32_e32 v3, 0x8000 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x8000 ; GFX8-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_xor_b32_sdwa v5, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX8-NEXT: v_xor_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_mul_f32_e32 v3, v5, v4 -; GFX8-NEXT: v_mul_f32_e32 v2, v2, v6 -; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_xor_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_xor_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mul_f32_e32 v2, v4, v3 +; GFX8-NEXT: v_mul_f32_e32 v3, v1, v5 +; GFX8-NEXT: v_bfe_u32 v1, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; GFX8-NEXT: v_lshrrev_b64 v[2:3], 16, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index 460f1211d1386..0c4a15f6a9d5e 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -61,34 +61,32 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] -; SDAG-NEXT: v_mov_b32_e32 v5, 0 -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v10, 0 ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2 -; SDAG-NEXT: v_mov_b32_e32 v4, v1 -; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v10, v[4:5] -; SDAG-NEXT: v_mul_lo_u32 v14, v10, v3 -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v4, v6 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v8, v[4:5] -; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v7, v5 +; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v1, s[4:5] +; SDAG-NEXT: 
v_mad_u64_u32 v[0:1], s[4:5], v11, v10, 0 +; SDAG-NEXT: v_mul_lo_u32 v12, v8, v2 +; SDAG-NEXT: v_mul_lo_u32 v13, v10, v3 +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v10, v[1:2] +; SDAG-NEXT: v_add3_u32 v4, v4, v13, v12 +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[3:4] +; SDAG-NEXT: v_mov_b32_e32 v1, v5 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v8, v[1:2] +; SDAG-NEXT: v_mul_lo_u32 v10, v9, v7 +; SDAG-NEXT: v_mul_lo_u32 v9, v9, v11 +; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v2 ; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v3, v9, v11 -; SDAG-NEXT: v_mul_lo_u32 v9, v9, v12 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v8, v[5:6] +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v8, v[5:6] +; SDAG-NEXT: v_add3_u32 v4, v9, v4, v10 ; SDAG-NEXT: ; implicit-def: $vgpr10 ; SDAG-NEXT: ; implicit-def: $vgpr8 -; SDAG-NEXT: v_add3_u32 v3, v9, v2, v3 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: ; implicit-def: $vgpr9 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v3 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v4, s[4:5] ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: .LBB0_4: ; %Flow ; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] ; SDAG-NEXT: s_cbranch_execz .LBB0_6 @@ -119,10 +117,11 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; SDAG-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SDAG-NEXT: v_mov_b32_e32 v3, v2 +; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB0_10: ; %fp-to-i-cleanup @@ -234,37 +233,36 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB0_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0 -; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffb8d, v6 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] -; 
GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] -; GISEL-NEXT: ; implicit-def: $vgpr9 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7] +; GISEL-NEXT: v_add_u32_e32 v2, 0xfffffbcd, v6 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v9, 0 +; GISEL-NEXT: v_add_u32_e32 v3, 0xfffffb8d, v6 +; GISEL-NEXT: v_sub_u32_e32 v6, 64, v2 +; GISEL-NEXT: v_lshrrev_b64 v[6:7], v6, v[4:5] +; GISEL-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v11, v9, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v12, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v12, v8, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v9 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v10, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v10, v10, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v8, v[5:6] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v10, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[8:9], v3, v13, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v12, v9, v[3:4] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v7, v8, v[5:6] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr8 +; GISEL-NEXT: ; implicit-def: $vgpr9 ; GISEL-NEXT: .LBB0_4: ; %Flow ; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[16:17] ; GISEL-NEXT: s_cbranch_execz .LBB0_6 @@ -276,16 +274,15 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v6, v9, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v7, v1, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v6, v9, 0 ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v6, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v9, v[2:3] +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v7, v9, v[4:5] ; GISEL-NEXT: v_mul_lo_u32 v10, v7, v9 -; GISEL-NEXT: v_mov_b32_e32 v2, v4 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], vcc, v6, v9, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], vcc, v6, v9, v[1:2] ; GISEL-NEXT: v_mul_lo_u32 v6, v6, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v8, v[3:4] -; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v5, v6, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v8, v[4:5] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v6, s[6:7] ; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v10, vcc ; GISEL-NEXT: .LBB0_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] @@ -429,34 +426,32 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] -; SDAG-NEXT: v_mov_b32_e32 v5, 0 -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v10, 0 ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2 -; SDAG-NEXT: v_mov_b32_e32 v4, 
v1 -; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v10, v[4:5] -; SDAG-NEXT: v_mul_lo_u32 v14, v10, v3 -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v4, v6 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v8, v[4:5] -; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v7, v5 +; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v1, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v10, 0 +; SDAG-NEXT: v_mul_lo_u32 v12, v8, v2 +; SDAG-NEXT: v_mul_lo_u32 v13, v10, v3 +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v10, v[1:2] +; SDAG-NEXT: v_add3_u32 v4, v4, v13, v12 +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[3:4] +; SDAG-NEXT: v_mov_b32_e32 v1, v5 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v8, v[1:2] +; SDAG-NEXT: v_mul_lo_u32 v10, v9, v7 +; SDAG-NEXT: v_mul_lo_u32 v9, v9, v11 +; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v2 ; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v3, v9, v11 -; SDAG-NEXT: v_mul_lo_u32 v9, v9, v12 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v8, v[5:6] +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v8, v[5:6] +; SDAG-NEXT: v_add3_u32 v4, v9, v4, v10 ; SDAG-NEXT: ; implicit-def: $vgpr10 ; SDAG-NEXT: ; implicit-def: $vgpr8 -; SDAG-NEXT: v_add3_u32 v3, v9, v2, v3 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: ; implicit-def: $vgpr9 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v3 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v4, s[4:5] ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: .LBB1_4: ; %Flow ; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] ; SDAG-NEXT: s_cbranch_execz .LBB1_6 @@ -487,10 +482,11 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; SDAG-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SDAG-NEXT: v_mov_b32_e32 v3, v2 +; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB1_10: ; %fp-to-i-cleanup @@ -602,37 +598,36 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB1_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0 -; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffb8d, v6 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; 
GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] -; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] -; GISEL-NEXT: ; implicit-def: $vgpr9 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7] +; GISEL-NEXT: v_add_u32_e32 v2, 0xfffffbcd, v6 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v9, 0 +; GISEL-NEXT: v_add_u32_e32 v3, 0xfffffb8d, v6 +; GISEL-NEXT: v_sub_u32_e32 v6, 64, v2 +; GISEL-NEXT: v_lshrrev_b64 v[6:7], v6, v[4:5] +; GISEL-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v11, v9, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v12, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v12, v8, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v9 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v10, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v10, v10, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v8, v[5:6] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v10, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[8:9], v3, v13, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v12, v9, v[3:4] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v7, v8, v[5:6] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr8 +; GISEL-NEXT: ; implicit-def: $vgpr9 ; GISEL-NEXT: .LBB1_4: ; %Flow ; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[16:17] ; GISEL-NEXT: s_cbranch_execz .LBB1_6 @@ -644,16 +639,15 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v6, v9, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v7, v1, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v6, v9, 0 ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v6, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v9, v[2:3] +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v7, v9, v[4:5] ; GISEL-NEXT: v_mul_lo_u32 v10, v7, v9 -; GISEL-NEXT: v_mov_b32_e32 v2, v4 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], vcc, v6, v9, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], vcc, v6, v9, v[1:2] ; GISEL-NEXT: v_mul_lo_u32 v6, v6, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v8, v[3:4] -; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v5, v6, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v8, v[4:5] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v6, s[6:7] ; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v10, vcc ; GISEL-NEXT: .LBB1_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, 
s[8:9] @@ -798,30 +792,30 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[7:8] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v11, 0 -; SDAG-NEXT: v_mul_lo_u32 v4, v9, v2 -; SDAG-NEXT: v_mul_lo_u32 v14, v11, v3 -; SDAG-NEXT: v_mov_b32_e32 v5, v1 -; SDAG-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v11, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v5, v7 -; SDAG-NEXT: v_mul_lo_u32 v7, v10, v13 -; SDAG-NEXT: v_add3_u32 v3, v3, v14, v4 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v9, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v13, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v8, v5 -; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v3, v10, v12 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v9, v[5:6] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v11, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v1, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v9, v2 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v11, v[5:6] +; SDAG-NEXT: v_mul_lo_u32 v12, v11, v3 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v11, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v5, v0 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v9, v[5:6] +; SDAG-NEXT: v_add3_u32 v3, v3, v12, v8 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v7, v[2:3] +; SDAG-NEXT: v_add_co_u32_e64 v0, s[4:5], v1, v6 +; SDAG-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v10, v13 +; SDAG-NEXT: v_mul_lo_u32 v7, v10, v7 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v9, v[0:1] ; SDAG-NEXT: ; implicit-def: $vgpr11 ; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: ; implicit-def: $vgpr10 -; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: v_add3_u32 v3, v7, v3, v8 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v0, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v1, v3, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v0, v4 +; SDAG-NEXT: v_mov_b32_e32 v1, v5 ; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr7_vgpr8 ; SDAG-NEXT: .LBB2_4: ; %Flow @@ -836,9 +830,9 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v7, s[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v6, v2 ; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, v[1:2] -; SDAG-NEXT: v_mov_b32_e32 v1, v5 -; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v10, v3, v[1:2] +; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v10, v3, v[5:6] ; SDAG-NEXT: v_mov_b32_e32 v1, v4 ; SDAG-NEXT: .LBB2_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] @@ -847,10 +841,11 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; SDAG-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SDAG-NEXT: v_mov_b32_e32 v3, v2 +; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: 
v_mov_b32_e32 v2, v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB2_10: ; %fp-to-i-cleanup @@ -961,37 +956,36 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB2_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v7, 0xffffff6a, v6 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v8, 0 -; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff2a, v6 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v9, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v9, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v12, v8 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v13, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v9, v[6:7] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] -; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v8, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] -; GISEL-NEXT: ; implicit-def: $vgpr8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v9, v[6:7] +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff6a, v6 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v8, 0 +; GISEL-NEXT: v_add_u32_e32 v3, 0xffffff2a, v6 +; GISEL-NEXT: v_sub_u32_e32 v6, 64, v2 +; GISEL-NEXT: v_lshrrev_b64 v[6:7], v6, v[4:5] +; GISEL-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v11, v8, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v12, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v12, v9, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v8 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v10, v8, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v10, v10, v8 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v9, v[5:6] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v10, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[8:9], v3, v13, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v12, v8, v[3:4] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v7, v9, v[5:6] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr9 +; GISEL-NEXT: ; implicit-def: $vgpr8 ; GISEL-NEXT: .LBB2_4: ; %Flow ; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[16:17] ; GISEL-NEXT: s_cbranch_execz .LBB2_6 @@ -1004,11 +998,11 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v6, 
v9, 0 ; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v6, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GISEL-NEXT: v_mul_lo_u32 v7, v6, v8 -; GISEL-NEXT: v_mov_b32_e32 v5, v2 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v6, v8, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], vcc, v6, v8, v[1:2] ; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_mov_b32_e32 v1, v4 +; GISEL-NEXT: v_mov_b32_e32 v2, v5 ; GISEL-NEXT: .LBB2_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: .LBB2_7: ; %Flow2 @@ -1152,30 +1146,30 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[7:8] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v11, 0 -; SDAG-NEXT: v_mul_lo_u32 v4, v9, v2 -; SDAG-NEXT: v_mul_lo_u32 v14, v11, v3 -; SDAG-NEXT: v_mov_b32_e32 v5, v1 -; SDAG-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v11, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v5, v7 -; SDAG-NEXT: v_mul_lo_u32 v7, v10, v13 -; SDAG-NEXT: v_add3_u32 v3, v3, v14, v4 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v9, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v13, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v8, v5 -; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v3, v10, v12 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v9, v[5:6] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v11, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v1, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v9, v2 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v11, v[5:6] +; SDAG-NEXT: v_mul_lo_u32 v12, v11, v3 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v11, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v5, v0 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v9, v[5:6] +; SDAG-NEXT: v_add3_u32 v3, v3, v12, v8 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v7, v[2:3] +; SDAG-NEXT: v_add_co_u32_e64 v0, s[4:5], v1, v6 +; SDAG-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v10, v13 +; SDAG-NEXT: v_mul_lo_u32 v7, v10, v7 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v9, v[0:1] ; SDAG-NEXT: ; implicit-def: $vgpr11 ; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: ; implicit-def: $vgpr10 -; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: v_add3_u32 v3, v7, v3, v8 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v0, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v1, v3, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v0, v4 +; SDAG-NEXT: v_mov_b32_e32 v1, v5 ; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr7_vgpr8 ; SDAG-NEXT: .LBB3_4: ; %Flow @@ -1190,9 +1184,9 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v7, s[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v6, v2 ; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, v[1:2] -; SDAG-NEXT: v_mov_b32_e32 v1, v5 -; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v10, v3, v[1:2] +; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v10, v3, v[5:6] ; SDAG-NEXT: v_mov_b32_e32 v1, v4 ; SDAG-NEXT: .LBB3_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] @@ -1201,10 +1195,11 
@@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; SDAG-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SDAG-NEXT: v_mov_b32_e32 v3, v2 +; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB3_10: ; %fp-to-i-cleanup @@ -1315,37 +1310,36 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB3_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v7, 0xffffff6a, v6 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v8, 0 -; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff2a, v6 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v9, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v9, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v12, v8 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v13, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v9, v[6:7] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] -; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v8, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] -; GISEL-NEXT: ; implicit-def: $vgpr8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v9, v[6:7] +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff6a, v6 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v8, 0 +; GISEL-NEXT: v_add_u32_e32 v3, 0xffffff2a, v6 +; GISEL-NEXT: v_sub_u32_e32 v6, 64, v2 +; GISEL-NEXT: v_lshrrev_b64 v[6:7], v6, v[4:5] +; GISEL-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v11, v8, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v12, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v12, v9, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v8 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v10, v8, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v10, v10, v8 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v9, v[5:6] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v10, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[8:9], v3, v13, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[8:9], v12, v8, v[3:4] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, v3, 0, 
s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v7, v9, v[5:6] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr9 +; GISEL-NEXT: ; implicit-def: $vgpr8 ; GISEL-NEXT: .LBB3_4: ; %Flow ; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[16:17] ; GISEL-NEXT: s_cbranch_execz .LBB3_6 @@ -1358,11 +1352,11 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v6, v9, 0 ; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v6, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GISEL-NEXT: v_mul_lo_u32 v7, v6, v8 -; GISEL-NEXT: v_mov_b32_e32 v5, v2 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v6, v8, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], vcc, v6, v8, v[1:2] ; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_mov_b32_e32 v1, v4 +; GISEL-NEXT: v_mov_b32_e32 v2, v5 ; GISEL-NEXT: .LBB3_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: .LBB3_7: ; %Flow2 @@ -1545,28 +1539,28 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[7:8] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v9, 0 -; SDAG-NEXT: v_mul_lo_u32 v4, v10, v2 -; SDAG-NEXT: v_mul_lo_u32 v14, v9, v3 -; SDAG-NEXT: v_mov_b32_e32 v5, v1 -; SDAG-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v9, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v5, v7 -; SDAG-NEXT: v_mul_lo_u32 v7, v11, v13 -; SDAG-NEXT: v_add3_u32 v3, v3, v14, v4 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v10, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v13, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v8, v5 -; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v3, v11, v12 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v10, v[5:6] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v9, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v1, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v10, v2 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v9, v[5:6] +; SDAG-NEXT: v_mul_lo_u32 v12, v9, v3 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v9, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v5, v0 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v10, v[5:6] +; SDAG-NEXT: v_add3_u32 v3, v3, v12, v8 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v7, v[2:3] +; SDAG-NEXT: v_add_co_u32_e64 v0, s[4:5], v1, v6 +; SDAG-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v11, v13 +; SDAG-NEXT: v_mul_lo_u32 v7, v11, v7 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v10, v[0:1] ; SDAG-NEXT: ; implicit-def: $vgpr9 -; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: v_add3_u32 v3, v7, v3, v8 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v0, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v1, v3, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v0, v4 +; SDAG-NEXT: v_mov_b32_e32 v1, v5 ; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr7_vgpr8 ; SDAG-NEXT: .LBB6_4: ; %Flow @@ -1590,10 +1584,11 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 
1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; SDAG-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SDAG-NEXT: v_mov_b32_e32 v3, v2 +; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB6_10: ; %fp-to-i-cleanup @@ -1705,33 +1700,32 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB6_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v10, 0xffffff7a, v5 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v10, v[6:7] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0 -; GISEL-NEXT: v_add_u32_e32 v4, 0xffffff3a, v5 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v10 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[6:7] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v4, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9 +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff7a, v5 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[6:7] +; GISEL-NEXT: v_add_u32_e32 v5, 0xffffff3a, v5 +; GISEL-NEXT: v_sub_u32_e32 v3, 64, v2 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GISEL-NEXT: v_lshrrev_b64 v[3:4], v3, v[6:7] +; GISEL-NEXT: v_lshlrev_b64 v[5:6], v5, v[6:7] +; GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v7, v9, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] -; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v12, v9, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v7, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v5, v8, v[10:11] +; GISEL-NEXT: v_mul_lo_u32 v13, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v7, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v7, v7, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v7, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[8:9], v3, v13, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v5, v9, v[3:4] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[10:11] ; GISEL-NEXT: ; implicit-def: $vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GISEL-NEXT: ; implicit-def: $vgpr8 @@ -1893,28 +1887,28 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) 
{ ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[7:8] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v9, 0 -; SDAG-NEXT: v_mul_lo_u32 v4, v10, v2 -; SDAG-NEXT: v_mul_lo_u32 v14, v9, v3 -; SDAG-NEXT: v_mov_b32_e32 v5, v1 -; SDAG-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v9, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v5, v7 -; SDAG-NEXT: v_mul_lo_u32 v7, v11, v13 -; SDAG-NEXT: v_add3_u32 v3, v3, v14, v4 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v10, v[5:6] -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v13, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v8, v5 -; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v3, v11, v12 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v10, v[5:6] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v9, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v1, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v10, v2 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v9, v[5:6] +; SDAG-NEXT: v_mul_lo_u32 v12, v9, v3 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v9, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v5, v0 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v10, v[5:6] +; SDAG-NEXT: v_add3_u32 v3, v3, v12, v8 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v7, v[2:3] +; SDAG-NEXT: v_add_co_u32_e64 v0, s[4:5], v1, v6 +; SDAG-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v8, v11, v13 +; SDAG-NEXT: v_mul_lo_u32 v7, v11, v7 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v10, v[0:1] ; SDAG-NEXT: ; implicit-def: $vgpr9 -; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: v_add3_u32 v3, v7, v3, v8 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v0, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v1, v3, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v0, v4 +; SDAG-NEXT: v_mov_b32_e32 v1, v5 ; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr7_vgpr8 ; SDAG-NEXT: .LBB7_4: ; %Flow @@ -1938,10 +1932,11 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; SDAG-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; SDAG-NEXT: v_mov_b32_e32 v3, v2 +; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; SDAG-NEXT: .LBB7_10: ; %fp-to-i-cleanup @@ -2053,33 +2048,32 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB7_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v10, 0xffffff7a, v5 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v10, v[6:7] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0 -; GISEL-NEXT: v_add_u32_e32 v4, 0xffffff3a, v5 -; GISEL-NEXT: v_sub_u32_e32 
v2, 64, v10 -; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[6:7] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v4, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9 +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff7a, v5 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[6:7] +; GISEL-NEXT: v_add_u32_e32 v5, 0xffffff3a, v5 +; GISEL-NEXT: v_sub_u32_e32 v3, 64, v2 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GISEL-NEXT: v_lshrrev_b64 v[3:4], v3, v[6:7] +; GISEL-NEXT: v_lshlrev_b64 v[5:6], v5, v[6:7] +; GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v7, v9, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] -; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v12, v9, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v7, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v5, v8, v[10:11] +; GISEL-NEXT: v_mul_lo_u32 v13, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v7, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v7, v7, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v7, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[8:9], v3, v13, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v5, v9, v[3:4] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[10:11] ; GISEL-NEXT: ; implicit-def: $vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GISEL-NEXT: ; implicit-def: $vgpr8 diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll index 68b95cd9adbf3..72c2003058a01 100644 --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -18,12 +18,15 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: s_lshr_b32 s1, s0, 1 -; SI-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; SI-NEXT: s_not_b32 s0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: v_alignbit_b32 v0, s1, v0, v1 +; SI-NEXT: s_mov_b32 s8, s1 +; SI-NEXT: s_mov_b32 s9, s0 +; SI-NEXT: s_lshr_b32 s3, s0, 1 +; SI-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 +; SI-NEXT: s_not_b32 s2, s2 +; SI-NEXT: s_mov_b32 s1, s3 +; SI-NEXT: s_and_b32 s2, s2, 31 +; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -32,14 +35,17 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[4:5], 
s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: s_mov_b32 s6, s1 +; VI-NEXT: s_mov_b32 s7, s0 +; VI-NEXT: s_lshr_b32 s3, s0, 1 ; VI-NEXT: s_not_b32 s2, s2 -; VI-NEXT: s_lshr_b32 s1, s0, 1 -; VI-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_alignbit_b32 v2, s1, v0, v1 +; VI-NEXT: s_lshr_b64 s[0:1], s[6:7], 1 +; VI-NEXT: s_mov_b32 s1, s3 +; VI-NEXT: s_and_b32 s2, s2, 31 +; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -49,12 +55,15 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_mov_b32 s4, s1 +; GFX9-NEXT: s_mov_b32 s5, s0 +; GFX9-NEXT: s_lshr_b32 s3, s0, 1 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX9-NEXT: s_not_b32 s2, s2 -; GFX9-NEXT: s_lshr_b32 s1, s0, 1 -; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, 1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v1, v2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_and_b32 s2, s2, 31 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -77,13 +86,18 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, 1 -; GFX10-NEXT: s_lshr_b32 s0, s0, 1 -; GFX10-NEXT: s_not_b32 s1, s2 -; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1 -; GFX10-NEXT: global_store_dword v1, v0, s[6:7] +; GFX10-NEXT: s_mov_b32 s4, s1 +; GFX10-NEXT: s_mov_b32 s5, s0 +; GFX10-NEXT: s_lshr_b32 s3, s0, 1 +; GFX10-NEXT: s_not_b32 s2, s2 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_and_b32 s2, s2, 31 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_i32: @@ -91,14 +105,18 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 1 -; GFX11-NEXT: s_lshr_b32 s0, s0, 1 -; GFX11-NEXT: s_not_b32 s1, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, s1 -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_mov_b32 s6, s1 +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_lshr_b32 s3, s0, 1 +; GFX11-NEXT: s_not_b32 s2, s2 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[6:7], 1 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_and_b32 s2, s2, 31 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_endpgm entry: %0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z) @@ -113,10 
+131,12 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_alignbit_b32 v0, s2, v0, 25 +; SI-NEXT: s_mov_b32 s0, s3 +; SI-NEXT: s_mov_b32 s1, s2 +; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], 25 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -124,10 +144,12 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_alignbit_b32 v2, s2, v0, 25 +; VI-NEXT: s_mov_b32 s4, s3 +; VI-NEXT: s_mov_b32 s5, s2 +; VI-NEXT: s_lshr_b64 s[2:3], s[4:5], 25 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -136,8 +158,10 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 25 +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_mov_b32 s5, s2 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 25 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -158,16 +182,22 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 25 +; GFX10-NEXT: s_mov_b32 s4, s3 +; GFX10-NEXT: s_mov_b32 s5, s2 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], 25 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 25 +; GFX11-NEXT: s_mov_b32 s4, s3 +; GFX11-NEXT: s_mov_b32 s5, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b64 s[2:3], s[4:5], 25 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm entry: @@ -185,41 +215,51 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_alignbit_b32 v0, s1, v0, 1 -; SI-NEXT: s_not_b32 s3, s5 -; SI-NEXT: s_lshr_b32 s1, s1, 1 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_alignbit_b32 v1, s1, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: s_not_b32 s1, s4 -; SI-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; SI-NEXT: s_lshr_b32 s0, s0, 1 -; SI-NEXT: v_mov_b32_e32 v2, s1 -; SI-NEXT: v_alignbit_b32 v0, s0, v0, v2 +; SI-NEXT: s_mov_b32 s6, s3 +; SI-NEXT: s_mov_b32 s7, s1 +; SI-NEXT: s_lshr_b32 s12, s1, 1 +; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 +; SI-NEXT: s_not_b32 s1, s5 +; SI-NEXT: s_mov_b32 s7, s12 +; SI-NEXT: s_and_b32 s1, s1, 31 +; SI-NEXT: s_mov_b32 s3, s0 +; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s1 +; SI-NEXT: s_lshr_b32 s5, s0, 1 +; 
SI-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 +; SI-NEXT: s_not_b32 s2, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_and_b32 s2, s2, 31 +; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s6 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_v2i32: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x3c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: s_not_b32 s7, s7 -; VI-NEXT: s_lshr_b32 s3, s1, 1 -; VI-NEXT: v_alignbit_b32 v0, s1, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_alignbit_b32 v1, s3, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_not_b32 s1, s6 -; VI-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; VI-NEXT: s_lshr_b32 s0, s0, 1 -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_alignbit_b32 v0, s0, v0, v2 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: s_mov_b32 s8, s3 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_lshr_b32 s10, s1, 1 +; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], 1 +; VI-NEXT: s_not_b32 s1, s5 +; VI-NEXT: s_mov_b32 s9, s10 +; VI-NEXT: s_and_b32 s1, s1, 31 +; VI-NEXT: s_mov_b32 s3, s0 +; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], s1 +; VI-NEXT: s_lshr_b32 s5, s0, 1 +; VI-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 +; VI-NEXT: s_not_b32 s2, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_and_b32 s2, s2, 31 +; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s8 +; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -230,18 +270,23 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x3c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: s_lshr_b32 s3, s1, 1 -; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, 1 +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_lshr_b32 s10, s1, 1 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 ; GFX9-NEXT: s_not_b32 s1, s9 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v1, s3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_not_b32 s1, s8 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; GFX9-NEXT: s_lshr_b32 s0, s0, 1 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3 +; GFX9-NEXT: s_mov_b32 s5, s10 +; GFX9-NEXT: s_and_b32 s1, s1, 31 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s1 +; GFX9-NEXT: s_mov_b32 s3, s0 +; GFX9-NEXT: s_lshr_b32 s5, s0, 1 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 +; GFX9-NEXT: s_not_b32 s2, s8 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_and_b32 s2, s2, 31 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -271,14 +316,23 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v0, s1, s3, 1 -; GFX10-NEXT: v_alignbit_b32 v3, s0, s2, 1 -; GFX10-NEXT: s_lshr_b32 s1, s1, 1 -; GFX10-NEXT: s_not_b32 s2, s7 -; GFX10-NEXT: s_lshr_b32 s0, s0, 1 -; 
GFX10-NEXT: s_not_b32 s3, s6 -; GFX10-NEXT: v_alignbit_b32 v1, s1, v0, s2 -; GFX10-NEXT: v_alignbit_b32 v0, s0, v3, s3 +; GFX10-NEXT: s_mov_b32 s4, s3 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: s_lshr_b32 s10, s1, 1 +; GFX10-NEXT: s_not_b32 s7, s7 +; GFX10-NEXT: s_lshr_b32 s11, s0, 1 +; GFX10-NEXT: s_not_b32 s6, s6 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX10-NEXT: s_and_b32 s4, s7, 31 +; GFX10-NEXT: s_and_b32 s5, s6, 31 +; GFX10-NEXT: s_mov_b32 s3, s11 +; GFX10-NEXT: s_mov_b32 s1, s10 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s5 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX10-NEXT: s_endpgm ; @@ -288,16 +342,25 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, s1, s3, 1 -; GFX11-NEXT: v_alignbit_b32 v3, s0, s2, 1 -; GFX11-NEXT: s_lshr_b32 s1, s1, 1 -; GFX11-NEXT: s_not_b32 s2, s7 -; GFX11-NEXT: s_lshr_b32 s0, s0, 1 -; GFX11-NEXT: s_not_b32 s3, s6 -; GFX11-NEXT: v_alignbit_b32 v1, s1, v0, s2 -; GFX11-NEXT: v_alignbit_b32 v0, s0, v3, s3 +; GFX11-NEXT: s_mov_b32 s8, s3 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_lshr_b32 s10, s1, 1 +; GFX11-NEXT: s_not_b32 s7, s7 +; GFX11-NEXT: s_lshr_b32 s11, s0, 1 +; GFX11-NEXT: s_not_b32 s6, s6 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX11-NEXT: s_and_b32 s7, s7, 31 +; GFX11-NEXT: s_and_b32 s6, s6, 31 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s1, s10 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s6 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm entry: @@ -314,10 +377,13 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_alignbit_b32 v1, s1, v0, 23 -; SI-NEXT: v_alignbit_b32 v0, s0, v2, 25 +; SI-NEXT: s_mov_b32 s8, s3 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s3, s0 +; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 23 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], 25 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s8 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -326,11 +392,14 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_alignbit_b32 v1, s1, v0, 23 -; VI-NEXT: v_alignbit_b32 v0, s0, v2, 25 +; VI-NEXT: s_mov_b32 s6, s3 +; VI-NEXT: s_mov_b32 s7, s1 +; VI-NEXT: s_mov_b32 s3, s0 +; VI-NEXT: s_lshr_b64 s[0:1], s[6:7], 23 +; VI-NEXT: s_lshr_b64 s[2:3], s[2:3], 25 ; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: v_mov_b32_e32 
v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -341,10 +410,13 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, 23 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v3, 25 +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s3, s0 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 23 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 25 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -369,8 +441,13 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s1, s3, 23 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, 25 +; GFX10-NEXT: s_mov_b32 s4, s3 +; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[2:3], 25 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], 23 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; @@ -379,10 +456,15 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s1, s3, 23 -; GFX11-NEXT: v_alignbit_b32 v0, s0, s2, 25 +; GFX11-NEXT: s_mov_b32 s6, s3 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s7, s1 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[2:3], 25 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[6:7], 23 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm entry: @@ -395,104 +477,134 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; SI-LABEL: fshl_v4i32: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd -; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x15 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x15 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_not_b32 s5, s19 -; SI-NEXT: v_mov_b32_e32 v0, s15 -; SI-NEXT: v_alignbit_b32 v0, s11, v0, 1 -; SI-NEXT: s_lshr_b32 s4, s11, 1 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_alignbit_b32 v3, s4, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s14 -; SI-NEXT: s_not_b32 s5, s18 -; SI-NEXT: v_alignbit_b32 v0, s10, v0, 1 -; SI-NEXT: s_lshr_b32 s4, s10, 1 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_alignbit_b32 v2, s4, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s13 -; SI-NEXT: s_not_b32 s5, s17 -; SI-NEXT: v_alignbit_b32 v0, s9, v0, 1 -; SI-NEXT: s_lshr_b32 s4, s9, 1 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_alignbit_b32 v1, s4, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: s_not_b32 s5, s16 -; SI-NEXT: v_alignbit_b32 v0, s8, v0, 1 -; SI-NEXT: s_lshr_b32 s4, s8, 1 -; SI-NEXT: v_mov_b32_e32 v4, s5 -; SI-NEXT: v_alignbit_b32 v0, s4, v0, v4 +; SI-NEXT: s_mov_b32 s16, s15 +; SI-NEXT: 
s_mov_b32 s17, s11 +; SI-NEXT: s_lshr_b32 s18, s11, 1 +; SI-NEXT: s_lshr_b64 s[16:17], s[16:17], 1 +; SI-NEXT: s_not_b32 s7, s7 +; SI-NEXT: s_mov_b32 s17, s18 +; SI-NEXT: s_and_b32 s7, s7, 31 +; SI-NEXT: s_mov_b32 s15, s10 +; SI-NEXT: s_lshr_b64 s[16:17], s[16:17], s7 +; SI-NEXT: s_lshr_b32 s7, s10, 1 +; SI-NEXT: s_lshr_b64 s[10:11], s[14:15], 1 +; SI-NEXT: s_not_b32 s6, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_and_b32 s6, s6, 31 +; SI-NEXT: s_lshr_b64 s[6:7], s[10:11], s6 +; SI-NEXT: s_mov_b32 s10, s13 +; SI-NEXT: s_mov_b32 s11, s9 +; SI-NEXT: s_lshr_b32 s7, s9, 1 +; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], 1 +; SI-NEXT: s_not_b32 s5, s5 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_and_b32 s5, s5, 31 +; SI-NEXT: s_mov_b32 s13, s8 +; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], s5 +; SI-NEXT: s_lshr_b32 s5, s8, 1 +; SI-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 +; SI-NEXT: s_not_b32 s4, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_and_b32 s4, s4, 31 +; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], s4 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s16 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_v4i32: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s15 +; VI-NEXT: s_mov_b32 s4, s15 +; VI-NEXT: s_mov_b32 s5, s11 +; VI-NEXT: s_lshr_b32 s16, s11, 1 +; VI-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 ; VI-NEXT: s_not_b32 s3, s3 -; VI-NEXT: s_lshr_b32 s6, s11, 1 -; VI-NEXT: v_alignbit_b32 v0, s11, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_alignbit_b32 v3, s6, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s14 -; VI-NEXT: s_not_b32 s2, s2 -; VI-NEXT: v_alignbit_b32 v0, s10, v0, 1 +; VI-NEXT: s_mov_b32 s5, s16 +; VI-NEXT: s_and_b32 s3, s3, 31 +; VI-NEXT: s_mov_b32 s15, s10 +; VI-NEXT: s_lshr_b64 s[4:5], s[4:5], s3 ; VI-NEXT: s_lshr_b32 s3, s10, 1 -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_alignbit_b32 v2, s3, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s13 +; VI-NEXT: s_lshr_b64 s[10:11], s[14:15], 1 +; VI-NEXT: s_not_b32 s2, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_and_b32 s2, s2, 31 +; VI-NEXT: s_lshr_b64 s[2:3], s[10:11], s2 +; VI-NEXT: s_mov_b32 s10, s13 +; VI-NEXT: s_mov_b32 s11, s9 +; VI-NEXT: s_lshr_b32 s3, s9, 1 +; VI-NEXT: s_lshr_b64 s[10:11], s[10:11], 1 ; VI-NEXT: s_not_b32 s1, s1 -; VI-NEXT: v_alignbit_b32 v0, s9, v0, 1 -; VI-NEXT: s_lshr_b32 s2, s9, 1 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_alignbit_b32 v1, s2, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s12 -; VI-NEXT: s_not_b32 s0, s0 -; VI-NEXT: v_alignbit_b32 v0, s8, v0, 1 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_and_b32 s1, s1, 31 +; VI-NEXT: s_mov_b32 s13, s8 +; VI-NEXT: s_lshr_b64 s[10:11], s[10:11], s1 ; VI-NEXT: s_lshr_b32 s1, s8, 1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_alignbit_b32 v0, s1, v0, v4 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 +; VI-NEXT: s_not_b32 s0, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_and_b32 s0, s0, 31 +; VI-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s10 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; 
VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_v4i32: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, s15 +; GFX9-NEXT: s_mov_b32 s5, s11 +; GFX9-NEXT: s_lshr_b32 s16, s11, 1 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 ; GFX9-NEXT: s_not_b32 s3, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s15 -; GFX9-NEXT: s_lshr_b32 s4, s11, 1 -; GFX9-NEXT: v_alignbit_b32 v0, s11, v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v3, s4, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s14 -; GFX9-NEXT: s_not_b32 s2, s2 -; GFX9-NEXT: v_alignbit_b32 v0, s10, v0, 1 +; GFX9-NEXT: s_mov_b32 s5, s16 +; GFX9-NEXT: s_and_b32 s3, s3, 31 +; GFX9-NEXT: s_mov_b32 s15, s10 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s3 ; GFX9-NEXT: s_lshr_b32 s3, s10, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_alignbit_b32 v2, s3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[14:15], 1 +; GFX9-NEXT: s_not_b32 s2, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_and_b32 s2, s2, 31 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[10:11], s2 +; GFX9-NEXT: s_mov_b32 s10, s13 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_lshr_b32 s3, s9, 1 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], 1 ; GFX9-NEXT: s_not_b32 s1, s1 -; GFX9-NEXT: v_alignbit_b32 v0, s9, v0, 1 -; GFX9-NEXT: s_lshr_b32 s2, s9, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: s_not_b32 s0, s0 -; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, 1 +; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_and_b32 s1, s1, 31 +; GFX9-NEXT: s_mov_b32 s13, s8 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s1 ; GFX9-NEXT: s_lshr_b32 s1, s8, 1 -; GFX9-NEXT: v_mov_b32_e32 v5, s0 -; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, v5 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 +; GFX9-NEXT: s_not_b32 s0, s0 +; GFX9-NEXT: s_mov_b32 s9, s1 +; GFX9-NEXT: s_and_b32 s0, s0, 31 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -530,22 +642,40 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v0, s11, s15, 1 -; GFX10-NEXT: v_alignbit_b32 v1, s10, s14, 1 -; GFX10-NEXT: v_alignbit_b32 v5, s9, s13, 1 -; GFX10-NEXT: v_alignbit_b32 v6, s8, s12, 1 -; GFX10-NEXT: s_lshr_b32 s4, s11, 1 -; GFX10-NEXT: s_not_b32 s3, s3 -; GFX10-NEXT: s_lshr_b32 s5, s10, 1 -; GFX10-NEXT: s_not_b32 s2, s2 -; GFX10-NEXT: s_lshr_b32 s9, s9, 1 +; GFX10-NEXT: s_mov_b32 s4, s15 +; GFX10-NEXT: s_mov_b32 s5, s11 +; GFX10-NEXT: s_mov_b32 s15, s10 +; GFX10-NEXT: s_lshr_b32 s16, s11, 1 +; GFX10-NEXT: s_not_b32 s11, s3 +; GFX10-NEXT: s_lshr_b32 s17, s10, 1 +; GFX10-NEXT: s_not_b32 s10, s2 +; GFX10-NEXT: s_lshr_b32 s18, s9, 1 +; GFX10-NEXT: s_mov_b32 s2, s13 +; GFX10-NEXT: s_mov_b32 s3, s9 +; GFX10-NEXT: s_lshr_b32 s19, s8, 1 +; GFX10-NEXT: s_mov_b32 s13, s8 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 +; GFX10-NEXT: s_and_b32 s11, s11, 31 +; GFX10-NEXT: 
s_and_b32 s10, s10, 31 +; GFX10-NEXT: s_mov_b32 s5, s16 +; GFX10-NEXT: s_mov_b32 s9, s17 ; GFX10-NEXT: s_not_b32 s1, s1 -; GFX10-NEXT: s_lshr_b32 s8, s8, 1 ; GFX10-NEXT: s_not_b32 s0, s0 -; GFX10-NEXT: v_alignbit_b32 v3, s4, v0, s3 -; GFX10-NEXT: v_alignbit_b32 v2, s5, v1, s2 -; GFX10-NEXT: v_alignbit_b32 v1, s9, v5, s1 -; GFX10-NEXT: v_alignbit_b32 v0, s8, v6, s0 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s11 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s10 +; GFX10-NEXT: s_lshr_b64 s[10:11], s[12:13], 1 +; GFX10-NEXT: s_mov_b32 s3, s18 +; GFX10-NEXT: s_mov_b32 s11, s19 +; GFX10-NEXT: s_and_b32 s0, s0, 31 +; GFX10-NEXT: s_and_b32 s5, s1, 31 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[10:11], s0 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s5 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX10-NEXT: s_endpgm ; @@ -555,24 +685,41 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, s11, s15, 1 -; GFX11-NEXT: v_alignbit_b32 v1, s10, s14, 1 -; GFX11-NEXT: v_alignbit_b32 v5, s9, s13, 1 -; GFX11-NEXT: v_alignbit_b32 v6, s8, s12, 1 -; GFX11-NEXT: s_lshr_b32 s6, s11, 1 -; GFX11-NEXT: s_not_b32 s3, s3 -; GFX11-NEXT: s_lshr_b32 s7, s10, 1 -; GFX11-NEXT: s_not_b32 s2, s2 -; GFX11-NEXT: s_lshr_b32 s9, s9, 1 +; GFX11-NEXT: s_mov_b32 s6, s15 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s15, s10 +; GFX11-NEXT: s_lshr_b32 s16, s11, 1 +; GFX11-NEXT: s_not_b32 s11, s3 +; GFX11-NEXT: s_lshr_b32 s17, s10, 1 +; GFX11-NEXT: s_not_b32 s10, s2 +; GFX11-NEXT: s_lshr_b32 s18, s9, 1 +; GFX11-NEXT: s_mov_b32 s2, s13 +; GFX11-NEXT: s_mov_b32 s3, s9 +; GFX11-NEXT: s_lshr_b32 s19, s8, 1 +; GFX11-NEXT: s_mov_b32 s13, s8 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 +; GFX11-NEXT: s_and_b32 s11, s11, 31 +; GFX11-NEXT: s_and_b32 s10, s10, 31 +; GFX11-NEXT: s_mov_b32 s7, s16 +; GFX11-NEXT: s_mov_b32 s9, s17 ; GFX11-NEXT: s_not_b32 s1, s1 -; GFX11-NEXT: s_lshr_b32 s8, s8, 1 ; GFX11-NEXT: s_not_b32 s0, s0 -; GFX11-NEXT: v_alignbit_b32 v3, s6, v0, s3 -; GFX11-NEXT: v_alignbit_b32 v2, s7, v1, s2 -; GFX11-NEXT: v_alignbit_b32 v1, s9, v5, s1 -; GFX11-NEXT: v_alignbit_b32 v0, s8, v6, s0 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s11 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s10 +; GFX11-NEXT: s_lshr_b64 s[10:11], s[12:13], 1 +; GFX11-NEXT: s_mov_b32 s3, s18 +; GFX11-NEXT: s_mov_b32 s11, s19 +; GFX11-NEXT: s_and_b32 s0, s0, 31 +; GFX11-NEXT: s_and_b32 s7, s1, 31 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[10:11], s0 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s6 +; GFX11-NEXT: v_mov_b32_e32 v2, s8 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] ; GFX11-NEXT: s_endpgm entry: @@ -589,14 +736,20 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s15 -; SI-NEXT: 
v_mov_b32_e32 v1, s14 -; SI-NEXT: v_alignbit_b32 v3, s11, v0, 31 -; SI-NEXT: v_mov_b32_e32 v0, s13 -; SI-NEXT: v_alignbit_b32 v2, s10, v1, 23 -; SI-NEXT: v_alignbit_b32 v1, s9, v0, 25 -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: v_alignbit_b32 v0, s8, v0, 31 +; SI-NEXT: s_mov_b32 s4, s15 +; SI-NEXT: s_mov_b32 s5, s11 +; SI-NEXT: s_mov_b32 s15, s10 +; SI-NEXT: s_mov_b32 s10, s13 +; SI-NEXT: s_mov_b32 s11, s9 +; SI-NEXT: s_mov_b32 s13, s8 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 31 +; SI-NEXT: s_lshr_b64 s[6:7], s[14:15], 23 +; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], 25 +; SI-NEXT: s_lshr_b64 s[8:9], s[12:13], 31 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -605,15 +758,21 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s15 -; VI-NEXT: v_mov_b32_e32 v1, s14 -; VI-NEXT: v_mov_b32_e32 v4, s13 -; VI-NEXT: v_alignbit_b32 v3, s11, v0, 31 -; VI-NEXT: v_alignbit_b32 v2, s10, v1, 23 -; VI-NEXT: v_alignbit_b32 v1, s9, v4, 25 -; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: s_mov_b32 s2, s15 +; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s10 +; VI-NEXT: s_mov_b32 s6, s13 +; VI-NEXT: s_mov_b32 s7, s9 +; VI-NEXT: s_mov_b32 s13, s8 +; VI-NEXT: s_lshr_b64 s[2:3], s[2:3], 31 +; VI-NEXT: s_lshr_b64 s[4:5], s[14:15], 23 +; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], 25 +; VI-NEXT: s_lshr_b64 s[8:9], s[12:13], 31 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_alignbit_b32 v0, s8, v0, 31 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -624,14 +783,20 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s15 -; GFX9-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-NEXT: v_alignbit_b32 v3, s11, v0, 31 -; GFX9-NEXT: v_mov_b32_e32 v0, s13 -; GFX9-NEXT: v_alignbit_b32 v2, s10, v1, 23 -; GFX9-NEXT: v_alignbit_b32 v1, s9, v0, 25 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, 31 +; GFX9-NEXT: s_mov_b32 s2, s15 +; GFX9-NEXT: s_mov_b32 s3, s11 +; GFX9-NEXT: s_mov_b32 s15, s10 +; GFX9-NEXT: s_mov_b32 s6, s13 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s13, s8 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 31 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[14:15], 23 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[6:7], 25 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[12:13], 31 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -660,10 +825,20 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v3, s11, s15, 31 -; GFX10-NEXT: v_alignbit_b32 v2, s10, s14, 23 -; GFX10-NEXT: v_alignbit_b32 v1, s9, s13, 25 -; GFX10-NEXT: v_alignbit_b32 v0, s8, s12, 31 +; GFX10-NEXT: s_mov_b32 s2, s15 +; 
GFX10-NEXT: s_mov_b32 s3, s11 +; GFX10-NEXT: s_mov_b32 s15, s10 +; GFX10-NEXT: s_mov_b32 s4, s13 +; GFX10-NEXT: s_mov_b32 s5, s9 +; GFX10-NEXT: s_mov_b32 s13, s8 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 31 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[14:15], 23 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[12:13], 31 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 25 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -672,12 +847,21 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v3, s11, s15, 31 -; GFX11-NEXT: v_alignbit_b32 v2, s10, s14, 23 -; GFX11-NEXT: v_alignbit_b32 v1, s9, s13, 25 -; GFX11-NEXT: v_alignbit_b32 v0, s8, s12, 31 +; GFX11-NEXT: s_mov_b32 s2, s15 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s15, s10 +; GFX11-NEXT: s_mov_b32 s4, s13 +; GFX11-NEXT: s_mov_b32 s5, s9 +; GFX11-NEXT: s_mov_b32 s13, s8 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 31 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[14:15], 23 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[12:13], 31 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], 25 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll index ef68f44bac203..7afb2cf317869 100644 --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -30,9 +30,11 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_mov_b32_e32 v1, s2 -; SI-NEXT: v_alignbit_b32 v0, s0, v0, v1 +; SI-NEXT: s_mov_b32 s8, s1 +; SI-NEXT: s_mov_b32 s9, s0 +; SI-NEXT: s_and_b32 s0, s2, 31 +; SI-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -41,11 +43,13 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_alignbit_b32 v2, s0, v0, v1 +; VI-NEXT: s_mov_b32 s6, s1 +; VI-NEXT: s_mov_b32 s7, s0 +; VI-NEXT: s_and_b32 s0, s2, 31 +; VI-NEXT: s_lshr_b64 s[0:1], s[6:7], s0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -55,9 +59,11 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, v2 +; GFX9-NEXT: s_mov_b32 s4, s1 +; GFX9-NEXT: s_mov_b32 s5, s0 +; GFX9-NEXT: s_and_b32 s0, s2, 31 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], s0 
+; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -77,62 +83,45 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[6:7] +; GFX10-NEXT: s_mov_b32 s4, s1 +; GFX10-NEXT: s_mov_b32 s5, s0 +; GFX10-NEXT: s_and_b32 s0, s2, 31 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[6:7] ; GFX10-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: fshr_i32: -; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s0, s1, v0.l -; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: fshr_i32: -; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s0, s1, v0 -; GFX11-FAKE16-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-FAKE16-NEXT: s_endpgm -; -; GFX12-TRUE16-LABEL: fshr_i32: -; GFX12-TRUE16: ; %bb.0: ; %entry -; GFX12-TRUE16-NEXT: s_clause 0x1 -; GFX12-TRUE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x2c -; GFX12-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, s0, s1, v0.l -; GFX12-TRUE16-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-TRUE16-NEXT: s_endpgm -; -; GFX12-FAKE16-LABEL: fshr_i32: -; GFX12-FAKE16: ; %bb.0: ; %entry -; GFX12-FAKE16-NEXT: s_clause 0x1 -; GFX12-FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x2c -; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_alignbit_b32 v0, s0, s1, v0 -; GFX12-FAKE16-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: fshr_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s6, s1 +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_and_b32 s0, s2, 31 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b64 s[0:1], s[6:7], s0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fshr_i32: +; GFX12: ; %bb.0: ; 
%entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x2c +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s6, s1 +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_and_b32 s0, s2, 31 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshr_b64 s[0:1], s[6:7], s0 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX12-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX12-NEXT: s_endpgm entry: %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z) store i32 %0, ptr addrspace(1) %in @@ -146,10 +135,12 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_alignbit_b32 v0, s2, v0, 7 +; SI-NEXT: s_mov_b32 s0, s3 +; SI-NEXT: s_mov_b32 s1, s2 +; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], 7 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -157,10 +148,12 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_alignbit_b32 v2, s2, v0, 7 +; VI-NEXT: s_mov_b32 s4, s3 +; VI-NEXT: s_mov_b32 s5, s2 +; VI-NEXT: s_lshr_b64 s[2:3], s[4:5], 7 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -169,8 +162,10 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 7 +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_mov_b32 s5, s2 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 7 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -191,25 +186,34 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 7 +; GFX10-NEXT: s_mov_b32 s4, s3 +; GFX10-NEXT: s_mov_b32 s5, s2 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], 7 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 7 +; GFX11-NEXT: s_mov_b32 s4, s3 +; GFX11-NEXT: s_mov_b32 s5, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b64 s[2:3], s[4:5], 7 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: fshr_i32_imm: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_alignbit_b32 v1, s2, s3, 7 +; GFX12-NEXT: s_mov_b32 s4, s3 +; GFX12-NEXT: s_mov_b32 s5, s2 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshr_b64 s[2:3], s[4:5], 7 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -218,22 +222,125 @@ entry: ret void } +define amdgpu_kernel void @fshr_i32_imm_src0(ptr addrspace(1) %in, i32 %x, i32 %y) { +; SI-LABEL: fshr_i32_imm_src0: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s9, 7 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s8, s3 +; SI-NEXT: s_and_b32 s0, s2, 31 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fshr_i32_imm_src0: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s5, 7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, s3 +; VI-NEXT: s_and_b32 s2, s2, 31 +; VI-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fshr_i32_imm_src0: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s5, 7 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_and_b32 s2, s2, 31 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; R600-LABEL: fshr_i32_imm_src0: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; R600-NEXT: BIT_ALIGN_INT * T1.X, literal.x, KC0[2].W, KC0[2].Z, +; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00) +; +; GFX10-LABEL: fshr_i32_imm_src0: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b32 s5, 7 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s4, s3 +; GFX10-NEXT: s_and_b32 s2, s2, 31 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fshr_i32_imm_src0: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_mov_b32 s5, 7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s4, s3 +; GFX11-NEXT: s_and_b32 s2, s2, 31 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fshr_i32_imm_src0: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s5, 7 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s4, s3 +; GFX12-NEXT: s_and_b32 s2, s2, 31 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm +entry: + %0 = call i32 @llvm.fshr.i32(i32 7, i32 %y, i32 %x) + store i32 %0, ptr addrspace(1) %in + ret void +} + define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) { ; SI-LABEL: fshr_v2i32: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xf +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_alignbit_b32 v1, s1, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_alignbit_b32 v0, s0, v0, v2 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s6, s3 +; SI-NEXT: s_mov_b32 s7, s1 +; SI-NEXT: s_and_b32 s1, s5, 31 +; SI-NEXT: s_mov_b32 s3, s0 +; SI-NEXT: s_and_b32 s0, s4, 31 +; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s1 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_v2i32: @@ -242,13 +349,16 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_alignbit_b32 v1, s1, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_alignbit_b32 v0, s0, v2, v0 +; VI-NEXT: s_mov_b32 s8, s3 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_and_b32 s1, s7, 31 +; VI-NEXT: s_mov_b32 s3, s0 +; VI-NEXT: s_and_b32 s0, s6, 31 +; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], s1 +; VI-NEXT: s_lshr_b64 s[0:1], s[2:3], s0 ; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s8 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -260,12 +370,15 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3 +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_and_b32 s1, s7, 31 +; GFX9-NEXT: s_mov_b32 s3, s0 +; GFX9-NEXT: s_and_b32 s0, s6, 31 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s1 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[2:3], s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX9-NEXT: s_endpgm ; @@ -286,79 +399,62 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-LABEL: fshr_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; 
GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s7 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_alignbit_b32 v1, s1, s3, v0 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, v2 -; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[8:9] +; GFX10-NEXT: s_mov_b32 s4, s3 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: s_and_b32 s0, s6, 31 +; GFX10-NEXT: s_and_b32 s6, s7, 31 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[2:3], s0 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], s6 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX10-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: fshr_v2i32: -; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_clause 0x2 -; GFX11-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s7 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, s1, s3, v0.l -; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s0, s2, v0.h -; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: fshr_v2i32: -; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, s1, s3, v0 -; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s0, s2, v2 -; GFX11-FAKE16-NEXT: global_store_b64 v3, v[0:1], s[4:5] -; GFX11-FAKE16-NEXT: s_endpgm -; -; GFX12-TRUE16-LABEL: fshr_v2i32: -; GFX12-TRUE16: ; %bb.0: ; %entry -; GFX12-TRUE16-NEXT: s_clause 0x2 -; GFX12-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c -; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX12-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, s7 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, s6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_alignbit_b32 v1, s1, s3, v0.l -; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, s0, s2, v0.h -; GFX12-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-TRUE16-NEXT: s_endpgm -; -; GFX12-FAKE16-LABEL: fshr_v2i32: -; GFX12-FAKE16: ; %bb.0: ; %entry -; GFX12-FAKE16-NEXT: s_clause 0x2 -; GFX12-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c -; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s6 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_alignbit_b32 v1, s1, s3, v0 -; GFX12-FAKE16-NEXT: v_alignbit_b32 v0, s0, s2, v2 -; GFX12-FAKE16-NEXT: global_store_b64 v3, v[0:1], s[4:5] -; 
GFX12-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: fshr_v2i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s3 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_and_b32 s0, s6, 31 +; GFX11-NEXT: s_and_b32 s6, s7, 31 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[2:3], s0 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[8:9], s6 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fshr_v2i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x2 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s3 +; GFX12-NEXT: s_mov_b32 s9, s1 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_and_b32 s0, s6, 31 +; GFX12-NEXT: s_and_b32 s6, s7, 31 +; GFX12-NEXT: s_lshr_b64 s[0:1], s[2:3], s0 +; GFX12-NEXT: s_lshr_b64 s[2:3], s[8:9], s6 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-NEXT: s_endpgm entry: %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z) store <2 x i32> %0, ptr addrspace(1) %in @@ -373,10 +469,13 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_alignbit_b32 v1, s1, v0, 9 -; SI-NEXT: v_alignbit_b32 v0, s0, v2, 7 +; SI-NEXT: s_mov_b32 s8, s3 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s3, s0 +; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 9 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], 7 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s8 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -385,11 +484,14 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_alignbit_b32 v1, s1, v0, 9 -; VI-NEXT: v_alignbit_b32 v0, s0, v2, 7 +; VI-NEXT: s_mov_b32 s6, s3 +; VI-NEXT: s_mov_b32 s7, s1 +; VI-NEXT: s_mov_b32 s3, s0 +; VI-NEXT: s_lshr_b64 s[0:1], s[6:7], 9 +; VI-NEXT: s_lshr_b64 s[2:3], s[2:3], 7 ; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -400,10 +502,13 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, 9 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v3, 7 +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s3, s0 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 9 +; 
GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 7 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -428,8 +533,13 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s1, s3, 9 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, 7 +; GFX10-NEXT: s_mov_b32 s4, s3 +; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[2:3], 7 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], 9 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; @@ -438,10 +548,15 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s1, s3, 9 -; GFX11-NEXT: v_alignbit_b32 v0, s0, s2, 7 +; GFX11-NEXT: s_mov_b32 s6, s3 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s7, s1 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[2:3], 7 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[6:7], 9 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm ; @@ -450,10 +565,15 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_alignbit_b32 v1, s1, s3, 9 -; GFX12-NEXT: v_alignbit_b32 v0, s0, s2, 7 +; GFX12-NEXT: s_mov_b32 s6, s3 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s7, s1 +; GFX12-NEXT: s_lshr_b64 s[0:1], s[2:3], 7 +; GFX12-NEXT: s_lshr_b64 s[2:3], s[6:7], 9 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX12-NEXT: s_endpgm entry: @@ -462,28 +582,173 @@ entry: ret void } -define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { -; SI-LABEL: fshr_v4i32: +define amdgpu_kernel void @fshr_v2i32_imm_src1(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) { +; SI-LABEL: fshr_v2i32_imm_src1: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x15 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s8, 9 +; SI-NEXT: s_mov_b32 s10, 7 ; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_and_b32 s1, s3, 31 +; SI-NEXT: s_mov_b32 s11, s0 +; SI-NEXT: s_and_b32 s0, s2, 31 +; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], s1 +; SI-NEXT: s_lshr_b64 s[0:1], s[10:11], s0 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fshr_v2i32_imm_src1: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; VI-NEXT: 
s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s6, 9 +; VI-NEXT: s_mov_b32 s8, 7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s7, s1 +; VI-NEXT: s_and_b32 s1, s3, 31 +; VI-NEXT: s_mov_b32 s9, s0 +; VI-NEXT: s_and_b32 s0, s2, 31 +; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], s1 +; VI-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fshr_v2i32_imm_src1: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s4, 9 +; GFX9-NEXT: s_mov_b32 s8, 7 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_and_b32 s1, s3, 31 +; GFX9-NEXT: s_mov_b32 s9, s0 +; GFX9-NEXT: s_and_b32 s0, s2, 31 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s1 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: s_endpgm +; +; R600-LABEL: fshr_v2i32_imm_src1: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].X, literal.x, KC0[3].Z, +; R600-NEXT: 9(1.261169e-44), 0(0.000000e+00) +; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[2].W, literal.x, KC0[3].Y, +; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00) +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX10-LABEL: fshr_v2i32_imm_src1: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b32 s4, 9 +; GFX10-NEXT: s_mov_b32 s8, 7 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_mov_b32 s9, s0 +; GFX10-NEXT: s_and_b32 s0, s2, 31 +; GFX10-NEXT: s_and_b32 s2, s3, 31 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fshr_v2i32_imm_src1: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_mov_b32 s6, 9 +; GFX11-NEXT: s_mov_b32 s8, 7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s1 +; GFX11-NEXT: s_mov_b32 s9, s0 +; GFX11-NEXT: s_and_b32 s0, s2, 31 +; GFX11-NEXT: s_and_b32 s2, s3, 31 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[6:7], s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fshr_v2i32_imm_src1: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s6, 9 +; GFX12-NEXT: s_mov_b32 s8, 7 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s7, s1 +; GFX12-NEXT: s_mov_b32 s9, 
s0
+; GFX12-NEXT: s_and_b32 s0, s2, 31
+; GFX12-NEXT: s_and_b32 s2, s3, 31
+; GFX12-NEXT: s_lshr_b64 s[0:1], s[8:9], s0
+; GFX12-NEXT: s_lshr_b64 s[2:3], s[6:7], s2
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-NEXT: s_endpgm
+entry:
+  %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> <i32 7, i32 9>, <2 x i32> %y)
+  store <2 x i32> %0, ptr addrspace(1) %in
+  ret void
+}
+
+define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
+; SI-LABEL: fshr_v4i32:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd
+; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x15
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s15
-; SI-NEXT: v_mov_b32_e32 v1, s3
-; SI-NEXT: v_alignbit_b32 v3, s11, v0, v1
-; SI-NEXT: v_mov_b32_e32 v0, s14
-; SI-NEXT: v_mov_b32_e32 v1, s2
-; SI-NEXT: v_alignbit_b32 v2, s10, v0, v1
-; SI-NEXT: v_mov_b32_e32 v0, s13
-; SI-NEXT: v_mov_b32_e32 v1, s1
-; SI-NEXT: v_alignbit_b32 v1, s9, v0, v1
-; SI-NEXT: v_mov_b32_e32 v0, s12
-; SI-NEXT: v_mov_b32_e32 v4, s0
-; SI-NEXT: v_alignbit_b32 v0, s8, v0, v4
-; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT: s_mov_b32 s4, s15
+; SI-NEXT: s_mov_b32 s5, s11
+; SI-NEXT: s_and_b32 s6, s19, 31
+; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s6
+; SI-NEXT: s_mov_b32 s15, s10
+; SI-NEXT: s_and_b32 s5, s18, 31
+; SI-NEXT: s_lshr_b64 s[6:7], s[14:15], s5
+; SI-NEXT: s_mov_b32 s10, s13
+; SI-NEXT: s_mov_b32 s11, s9
+; SI-NEXT: s_and_b32 s5, s17, 31
+; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], s5
+; SI-NEXT: s_mov_b32 s13, s8
+; SI-NEXT: s_and_b32 s5, s16, 31
+; SI-NEXT: s_lshr_b64 s[8:9], s[12:13], s5
+; SI-NEXT: v_mov_b32_e32 v0, s8
+; SI-NEXT: v_mov_b32_e32 v1, s10
+; SI-NEXT: v_mov_b32_e32 v2, s6
+; SI-NEXT: v_mov_b32_e32 v3, s4
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: s_endpgm
 ;
 ; VI-LABEL: fshr_v4i32:
@@ -492,19 +757,25 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i
 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s15
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s14
-; VI-NEXT: v_alignbit_b32 v3, s11, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_alignbit_b32 v2, s10, v2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s13
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_alignbit_b32 v1, s9, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s12
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_alignbit_b32 v0, s8, v0, v4
+; VI-NEXT: s_mov_b32 s6, s15
+; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_and_b32 s3, s3, 31
+; VI-NEXT: s_mov_b32 s15, s10
+; VI-NEXT: s_and_b32 s2, s2, 31
+; VI-NEXT: s_mov_b32 s10, s13
+; VI-NEXT: s_mov_b32 s11, s9
+; VI-NEXT: s_and_b32 s1, s1, 31
+; VI-NEXT: s_mov_b32 s13, s8
+; VI-NEXT: s_and_b32 s0, s0, 31
+; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], s3
+; VI-NEXT: s_lshr_b64 s[2:3], s[14:15], s2
+; VI-NEXT: s_lshr_b64 s[10:11], s[10:11], s1
+; VI-NEXT: s_lshr_b64 s[0:1], s[12:13], s0
 ; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s10
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s6
 ; VI-NEXT: v_mov_b32_e32 v5, s5
 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; 
VI-NEXT: s_endpgm @@ -516,18 +787,24 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s15 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v3, s11, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s14 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_alignbit_b32 v2, s10, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s13 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v1, s9, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_mov_b32_e32 v5, s0 -; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, v5 +; GFX9-NEXT: s_mov_b32 s4, s15 +; GFX9-NEXT: s_mov_b32 s5, s11 +; GFX9-NEXT: s_and_b32 s3, s3, 31 +; GFX9-NEXT: s_mov_b32 s15, s10 +; GFX9-NEXT: s_and_b32 s2, s2, 31 +; GFX9-NEXT: s_mov_b32 s10, s13 +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: s_and_b32 s1, s1, 31 +; GFX9-NEXT: s_mov_b32 s13, s8 +; GFX9-NEXT: s_and_b32 s0, s0, 31 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s3 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[14:15], s2 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s1 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[12:13], s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -552,101 +829,87 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-LABEL: fshr_v4i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v6, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-NEXT: s_mov_b32 s4, s15 +; GFX10-NEXT: s_mov_b32 s5, s11 +; GFX10-NEXT: s_and_b32 s11, s3, 31 +; GFX10-NEXT: s_mov_b32 s15, s10 +; GFX10-NEXT: s_and_b32 s10, s2, 31 +; GFX10-NEXT: s_mov_b32 s2, s13 +; GFX10-NEXT: s_mov_b32 s3, s9 +; GFX10-NEXT: s_and_b32 s16, s1, 31 +; GFX10-NEXT: s_mov_b32 s13, s8 +; GFX10-NEXT: s_and_b32 s8, s0, 31 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], s11 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[14:15], s10 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[12:13], s8 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s16 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: v_mov_b32_e32 v4, s1 -; GFX10-NEXT: v_mov_b32_e32 v5, s0 -; GFX10-NEXT: v_alignbit_b32 v3, s11, s15, v0 -; GFX10-NEXT: v_alignbit_b32 v2, s10, s14, v1 -; GFX10-NEXT: v_alignbit_b32 v1, s9, s13, v4 -; GFX10-NEXT: v_alignbit_b32 v0, s8, s12, v5 -; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[6:7] +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX10-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: fshr_v4i32: -; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_clause 0x2 -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 -; GFX11-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 -; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, 0 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v4.l, s0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_alignbit_b32 v3, s11, s15, v0.l -; GFX11-TRUE16-NEXT: v_alignbit_b32 v2, s10, s14, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, s9, s13, v1.l -; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s8, s12, v4.l -; GFX11-TRUE16-NEXT: global_store_b128 v5, v[0:3], s[4:5] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: fshr_v4i32: -; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 -; GFX11-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 -; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, 0 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_alignbit_b32 v3, s11, s15, v0 -; GFX11-FAKE16-NEXT: v_alignbit_b32 v2, s10, s14, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, s9, s13, v4 -; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s8, s12, v5 -; GFX11-FAKE16-NEXT: global_store_b128 v6, v[0:3], s[4:5] -; GFX11-FAKE16-NEXT: s_endpgm -; -; GFX12-TRUE16-LABEL: fshr_v4i32: -; GFX12-TRUE16: ; %bb.0: ; %entry -; GFX12-TRUE16-NEXT: s_clause 0x2 -; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 -; GFX12-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 -; GFX12-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, 0 -; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, s3 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, s2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, s0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_alignbit_b32 v3, s11, s15, v0.l -; GFX12-TRUE16-NEXT: v_alignbit_b32 v2, s10, s14, v0.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_alignbit_b32 v1, s9, s13, v1.l -; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, s8, s12, v4.l -; GFX12-TRUE16-NEXT: global_store_b128 v5, v[0:3], s[4:5] -; GFX12-TRUE16-NEXT: s_endpgm -; -; GFX12-FAKE16-LABEL: fshr_v4i32: -; GFX12-FAKE16: ; %bb.0: ; %entry -; GFX12-FAKE16-NEXT: s_clause 0x2 -; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 -; GFX12-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 -; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, 0 -; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_alignbit_b32 v3, s11, s15, v0 -; GFX12-FAKE16-NEXT: v_alignbit_b32 v2, s10, s14, v1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-FAKE16-NEXT: v_alignbit_b32 v1, s9, s13, v4 -; GFX12-FAKE16-NEXT: v_alignbit_b32 v0, s8, s12, v5 -; GFX12-FAKE16-NEXT: global_store_b128 v6, v[0:3], s[4:5] -; GFX12-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: fshr_v4i32: +; 
GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s6, s15 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_and_b32 s11, s3, 31 +; GFX11-NEXT: s_mov_b32 s15, s10 +; GFX11-NEXT: s_and_b32 s10, s2, 31 +; GFX11-NEXT: s_mov_b32 s2, s13 +; GFX11-NEXT: s_mov_b32 s3, s9 +; GFX11-NEXT: s_and_b32 s16, s1, 31 +; GFX11-NEXT: s_mov_b32 s13, s8 +; GFX11-NEXT: s_and_b32 s8, s0, 31 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[6:7], s11 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[14:15], s10 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[12:13], s8 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fshr_v4i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x2 +; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s6, s15 +; GFX12-NEXT: s_mov_b32 s7, s11 +; GFX12-NEXT: s_and_b32 s11, s3, 31 +; GFX12-NEXT: s_mov_b32 s15, s10 +; GFX12-NEXT: s_and_b32 s10, s2, 31 +; GFX12-NEXT: s_mov_b32 s2, s13 +; GFX12-NEXT: s_mov_b32 s3, s9 +; GFX12-NEXT: s_and_b32 s16, s1, 31 +; GFX12-NEXT: s_mov_b32 s13, s8 +; GFX12-NEXT: s_and_b32 s8, s0, 31 +; GFX12-NEXT: s_lshr_b64 s[0:1], s[6:7], s11 +; GFX12-NEXT: s_lshr_b64 s[6:7], s[14:15], s10 +; GFX12-NEXT: s_lshr_b64 s[8:9], s[12:13], s8 +; GFX12-NEXT: s_lshr_b64 s[2:3], s[2:3], s16 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s0 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] +; GFX12-NEXT: s_endpgm entry: %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) store <4 x i32> %0, ptr addrspace(1) %in @@ -661,14 +924,20 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s15 -; SI-NEXT: v_mov_b32_e32 v1, s14 -; SI-NEXT: v_alignbit_b32 v3, s11, v0, 1 -; SI-NEXT: v_mov_b32_e32 v0, s13 -; SI-NEXT: v_alignbit_b32 v2, s10, v1, 9 -; SI-NEXT: v_alignbit_b32 v1, s9, v0, 7 -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: v_alignbit_b32 v0, s8, v0, 1 +; SI-NEXT: s_mov_b32 s4, s15 +; SI-NEXT: s_mov_b32 s5, s11 +; SI-NEXT: s_mov_b32 s15, s10 +; SI-NEXT: s_mov_b32 s10, s13 +; SI-NEXT: s_mov_b32 s11, s9 +; SI-NEXT: s_mov_b32 s13, s8 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 +; SI-NEXT: s_lshr_b64 s[6:7], s[14:15], 9 +; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], 7 +; SI-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -677,15 +946,21 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; 
VI-NEXT: v_mov_b32_e32 v0, s15 -; VI-NEXT: v_mov_b32_e32 v1, s14 -; VI-NEXT: v_mov_b32_e32 v4, s13 -; VI-NEXT: v_alignbit_b32 v3, s11, v0, 1 -; VI-NEXT: v_alignbit_b32 v2, s10, v1, 9 -; VI-NEXT: v_alignbit_b32 v1, s9, v4, 7 -; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: s_mov_b32 s2, s15 +; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s10 +; VI-NEXT: s_mov_b32 s6, s13 +; VI-NEXT: s_mov_b32 s7, s9 +; VI-NEXT: s_mov_b32 s13, s8 +; VI-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; VI-NEXT: s_lshr_b64 s[4:5], s[14:15], 9 +; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], 7 +; VI-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_alignbit_b32 v0, s8, v0, 1 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -696,14 +971,20 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s15 -; GFX9-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-NEXT: v_alignbit_b32 v3, s11, v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v0, s13 -; GFX9-NEXT: v_alignbit_b32 v2, s10, v1, 9 -; GFX9-NEXT: v_alignbit_b32 v1, s9, v0, 7 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, 1 +; GFX9-NEXT: s_mov_b32 s2, s15 +; GFX9-NEXT: s_mov_b32 s3, s11 +; GFX9-NEXT: s_mov_b32 s15, s10 +; GFX9-NEXT: s_mov_b32 s6, s13 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s13, s8 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[14:15], 9 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[6:7], 7 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -730,10 +1011,20 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v3, s11, s15, 1 -; GFX10-NEXT: v_alignbit_b32 v2, s10, s14, 9 -; GFX10-NEXT: v_alignbit_b32 v1, s9, s13, 7 -; GFX10-NEXT: v_alignbit_b32 v0, s8, s12, 1 +; GFX10-NEXT: s_mov_b32 s2, s15 +; GFX10-NEXT: s_mov_b32 s3, s11 +; GFX10-NEXT: s_mov_b32 s15, s10 +; GFX10-NEXT: s_mov_b32 s4, s13 +; GFX10-NEXT: s_mov_b32 s5, s9 +; GFX10-NEXT: s_mov_b32 s13, s8 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[14:15], 9 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 7 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -742,12 +1033,21 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v3, s11, s15, 1 -; GFX11-NEXT: v_alignbit_b32 v2, s10, s14, 9 -; GFX11-NEXT: v_alignbit_b32 v1, s9, s13, 7 -; GFX11-NEXT: v_alignbit_b32 v0, s8, s12, 1 +; GFX11-NEXT: s_mov_b32 s2, s15 +; 
GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s15, s10 +; GFX11-NEXT: s_mov_b32 s4, s13 +; GFX11-NEXT: s_mov_b32 s5, s9 +; GFX11-NEXT: s_mov_b32 s13, s8 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[14:15], 9 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], 7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm ; @@ -756,12 +1056,21 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_alignbit_b32 v3, s11, s15, 1 -; GFX12-NEXT: v_alignbit_b32 v2, s10, s14, 9 -; GFX12-NEXT: v_alignbit_b32 v1, s9, s13, 7 -; GFX12-NEXT: v_alignbit_b32 v0, s8, s12, 1 +; GFX12-NEXT: s_mov_b32 s2, s15 +; GFX12-NEXT: s_mov_b32 s3, s11 +; GFX12-NEXT: s_mov_b32 s15, s10 +; GFX12-NEXT: s_mov_b32 s4, s13 +; GFX12-NEXT: s_mov_b32 s5, s9 +; GFX12-NEXT: s_mov_b32 s13, s8 +; GFX12-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX12-NEXT: s_lshr_b64 s[6:7], s[14:15], 9 +; GFX12-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 +; GFX12-NEXT: s_lshr_b64 s[4:5], s[4:5], 7 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s2 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -770,6 +1079,194 @@ entry: ret void } +define amdgpu_kernel void @fshr_v4i32_imm_src0(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y) { +; SI-LABEL: fshr_v4i32_imm_src0: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 33 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, s11 +; SI-NEXT: s_and_b32 s4, s15, 31 +; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s4 +; SI-NEXT: s_mov_b32 s11, 9 +; SI-NEXT: s_and_b32 s5, s14, 31 +; SI-NEXT: s_lshr_b64 s[6:7], s[10:11], s5 +; SI-NEXT: s_mov_b32 s11, 7 +; SI-NEXT: s_mov_b32 s10, s9 +; SI-NEXT: s_and_b32 s5, s13, 31 +; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], s5 +; SI-NEXT: s_mov_b32 s9, 1 +; SI-NEXT: s_and_b32 s5, s12, 31 +; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], s5 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fshr_v4i32_imm_src0: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s1, 33 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s7, 7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s11 +; VI-NEXT: s_and_b32 s4, s15, 31 +; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 +; VI-NEXT: s_mov_b32 s11, 9 +; VI-NEXT: s_and_b32 s1, s14, 31 +; VI-NEXT: s_lshr_b64 s[4:5], s[10:11], s1 +; VI-NEXT: s_mov_b32 s6, s9 +; VI-NEXT: s_and_b32 s1, s13, 31 +; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], s1 +; VI-NEXT: s_mov_b32 s9, 1 +; VI-NEXT: s_and_b32 s1, s12, 31 +; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], s1 +; VI-NEXT: v_mov_b32_e32 v5, s3 
+; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fshr_v4i32_imm_src0: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GFX9-NEXT: s_mov_b32 s1, 33 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 7 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, s11 +; GFX9-NEXT: s_and_b32 s4, s15, 31 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 +; GFX9-NEXT: s_mov_b32 s11, 9 +; GFX9-NEXT: s_and_b32 s1, s14, 31 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[10:11], s1 +; GFX9-NEXT: s_mov_b32 s6, s9 +; GFX9-NEXT: s_and_b32 s1, s13, 31 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[6:7], s1 +; GFX9-NEXT: s_mov_b32 s9, 1 +; GFX9-NEXT: s_and_b32 s1, s12, 31 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[8:9], s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX9-NEXT: s_endpgm +; +; R600-LABEL: fshr_v4i32_imm_src0: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: BIT_ALIGN_INT * T0.W, literal.x, KC0[4].X, KC0[5].X, +; R600-NEXT: 33(4.624285e-44), 0(0.000000e+00) +; R600-NEXT: BIT_ALIGN_INT * T0.Z, literal.x, KC0[3].W, KC0[4].W, +; R600-NEXT: 9(1.261169e-44), 0(0.000000e+00) +; R600-NEXT: BIT_ALIGN_INT * T0.Y, literal.x, KC0[3].Z, KC0[4].Z, +; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00) +; R600-NEXT: BIT_ALIGN_INT * T0.X, 1, KC0[3].Y, KC0[4].Y, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX10-LABEL: fshr_v4i32_imm_src0: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b32 s1, 33 +; GFX10-NEXT: s_mov_b32 s3, 7 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s0, s11 +; GFX10-NEXT: s_and_b32 s4, s15, 31 +; GFX10-NEXT: s_mov_b32 s11, 9 +; GFX10-NEXT: s_and_b32 s5, s14, 31 +; GFX10-NEXT: s_mov_b32 s2, s9 +; GFX10-NEXT: s_and_b32 s13, s13, 31 +; GFX10-NEXT: s_mov_b32 s9, 1 +; GFX10-NEXT: s_and_b32 s12, s12, 31 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[10:11], s5 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s12 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s13 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fshr_v4i32_imm_src0: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_mov_b32 s1, 33 +; GFX11-NEXT: s_mov_b32 s3, 7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, s11 +; GFX11-NEXT: s_and_b32 s6, s15, 31 +; GFX11-NEXT: s_mov_b32 s11, 9 +; GFX11-NEXT: s_and_b32 s7, s14, 31 +; GFX11-NEXT: s_mov_b32 s2, s9 +; GFX11-NEXT: s_and_b32 s13, s13, 31 +; GFX11-NEXT: s_mov_b32 s9, 1 +; GFX11-NEXT: s_and_b32 s12, s12, 31 +; GFX11-NEXT: s_lshr_b64 
s[0:1], s[0:1], s6
+; GFX11-NEXT: s_lshr_b64 s[6:7], s[10:11], s7
+; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s12
+; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s13
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_mov_b32_e32 v2, s6
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5]
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: fshr_v4i32_imm_src0:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
+; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s1, 33
+; GFX12-NEXT: s_mov_b32 s3, 7
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s0, s11
+; GFX12-NEXT: s_and_b32 s6, s15, 31
+; GFX12-NEXT: s_mov_b32 s11, 9
+; GFX12-NEXT: s_and_b32 s7, s14, 31
+; GFX12-NEXT: s_mov_b32 s2, s9
+; GFX12-NEXT: s_and_b32 s13, s13, 31
+; GFX12-NEXT: s_mov_b32 s9, 1
+; GFX12-NEXT: s_and_b32 s12, s12, 31
+; GFX12-NEXT: s_lshr_b64 s[0:1], s[0:1], s6
+; GFX12-NEXT: s_lshr_b64 s[6:7], s[10:11], s7
+; GFX12-NEXT: s_lshr_b64 s[8:9], s[8:9], s12
+; GFX12-NEXT: s_lshr_b64 s[2:3], s[2:3], s13
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s0
+; GFX12-NEXT: v_mov_b32_e32 v2, s6
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
+; GFX12-NEXT: s_endpgm
+entry:
+  %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> <i32 1, i32 7, i32 9, i32 33>, <4 x i32> %x, <4 x i32> %y)
+  store <4 x i32> %0, ptr addrspace(1) %in
+  ret void
+}
+
 define i32 @v_fshr_i32(i32 %src0, i32 %src1, i32 %src2) {
 ; GFX89-LABEL: v_fshr_i32:
 ; GFX89: ; %bb.0:
@@ -2091,29 +2588,109 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
 ; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_fshr_v2i24:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffffff, v4
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffffff, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v4
-; GFX11-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_u32_u24_e32 v6, 24, v6
-; GFX11-NEXT: v_mul_u32_u24_e32 v7, 24, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_nc_u32_e32 v4, v4, v6
-; GFX11-NEXT: v_sub_nc_u32_e32 v5, v5, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 8, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 8, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_alignbit_b32 v0, v0, v2, v4
-; GFX11-NEXT: v_alignbit_b32 v1, v1, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_fshr_v2i24:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffffff, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffffff, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: 
v_mul_hi_u32 v6, 0xaaaaaab, v4 +; GFX11-TRUE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mul_u32_u24_e32 v6, 24, v6 +; GFX11-TRUE16-NEXT: v_mul_u32_u24_e32 v7, 24, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6 +; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 8, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 8, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, v0, v2, v4.l +; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, v1, v3, v5.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fshr_v2i24: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v4 +; GFX11-FAKE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_mul_u32_u24_e32 v6, 24, v6 +; GFX11-FAKE16-NEXT: v_mul_u32_u24_e32 v7, 24, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6 +; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 8, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 8, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, v0, v2, v4 +; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, v1, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-TRUE16-LABEL: v_fshr_v2i24: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v4 +; GFX12-TRUE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_mul_u32_u24_e32 v6, 24, v6 +; GFX12-TRUE16-NEXT: v_mul_u32_u24_e32 v7, 24, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6 +; GFX12-TRUE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4, 8, v4 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v5, 8, v5 +; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, v0, v2, v4.l +; GFX12-TRUE16-NEXT: v_alignbit_b32 v1, v1, v3, v5.l +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: v_fshr_v2i24: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffffff, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-FAKE16-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v4 +; GFX12-FAKE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_mul_u32_u24_e32 v6, 24, v6 +; GFX12-FAKE16-NEXT: v_mul_u32_u24_e32 v7, 24, v7 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6 +; GFX12-FAKE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v4, 8, v4 +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v5, 8, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_alignbit_b32 v0, v0, v2, v4 +; GFX12-FAKE16-NEXT: v_alignbit_b32 v1, v1, v3, v5 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) ret <2 x i24> %ret } diff --git a/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll b/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll index 4aa49f2c9296d..1db476300c261 100644 --- a/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll +++ b/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll @@ -6,9 +6,9 @@ define amdgpu_kernel void @foo() { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_mov_b64 s[0:1], src_shared_base ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s1 -; CHECK-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v0 -; CHECK-NEXT: flat_store_b64 v[0:1], v[2:3] +; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s1 +; CHECK-NEXT: v_mov_b32_e32 v0, v1 +; CHECK-NEXT: flat_store_b64 v[1:2], v[0:1] ; CHECK-NEXT: s_endpgm entry: br label %bb1 diff --git a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir index 6b9b77c228350..437b4e8b9b493 100644 --- a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir +++ b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir @@ -1112,11 +1112,11 @@ body: | ; GCN-NEXT: S_WAITCNT 0 ; GCN-NEXT: renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, killed $vgpr0, 0, killed $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 0 - ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, killed renamable $vgpr2 + ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, killed renamable $vgpr2 ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31 S_WAITCNT 0 renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, killed $vgpr0, 0, 
killed $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, killed renamable $vgpr2 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, killed renamable $vgpr2 S_SETPC_B64_return undef $sgpr30_sgpr31 ... diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index 424aaaea11722..c8ebe0bb6405d 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -82,9 +82,9 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_mov_b32 s69, s68 ; CHECK-NEXT: s_mov_b32 s71, s68 ; CHECK-NEXT: v_writelane_b32 v7, s67, 31 -; CHECK-NEXT: image_sample_lz v3, v[1:2], s[60:67], s[68:71] dmask:0x1 +; CHECK-NEXT: image_sample_lz v1, v[1:2], s[60:67], s[68:71] dmask:0x1 ; CHECK-NEXT: v_readlane_b32 s52, v7, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, v2 +; CHECK-NEXT: v_mov_b32_e32 v3, v2 ; CHECK-NEXT: v_readlane_b32 s53, v7, 1 ; CHECK-NEXT: v_readlane_b32 s54, v7, 2 ; CHECK-NEXT: v_readlane_b32 s55, v7, 3 @@ -97,14 +97,14 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_readlane_b32 s60, v7, 8 ; CHECK-NEXT: v_readlane_b32 s61, v7, 9 ; CHECK-NEXT: v_readlane_b32 s62, v7, 10 -; CHECK-NEXT: image_sample_lz v4, v[1:2], s[52:59], s[68:71] dmask:0x1 +; CHECK-NEXT: image_sample_lz v4, v[2:3], s[52:59], s[68:71] dmask:0x1 ; CHECK-NEXT: v_readlane_b32 s63, v7, 11 ; CHECK-NEXT: v_readlane_b32 s64, v7, 12 ; CHECK-NEXT: v_readlane_b32 s65, v7, 13 ; CHECK-NEXT: v_readlane_b32 s66, v7, 14 ; CHECK-NEXT: v_readlane_b32 s67, v7, 15 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mul_f32_e32 v0, v4, v3 +; CHECK-NEXT: v_mul_f32_e32 v0, v4, v1 ; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CHECK-NEXT: s_cbranch_execz .LBB0_3 @@ -118,13 +118,13 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_readlane_b32 s65, v7, 29 ; CHECK-NEXT: v_readlane_b32 s66, v7, 30 ; CHECK-NEXT: v_readlane_b32 s67, v7, 31 +; CHECK-NEXT: v_mov_b32_e32 v1, v2 ; CHECK-NEXT: s_and_b64 vcc, exec, -1 ; CHECK-NEXT: v_readlane_b32 s53, v7, 17 ; CHECK-NEXT: v_readlane_b32 s54, v7, 18 ; CHECK-NEXT: v_readlane_b32 s55, v7, 19 +; CHECK-NEXT: image_sample_lz v3, v[2:3], s[60:67], s[68:71] dmask:0x1 ; CHECK-NEXT: v_readlane_b32 s56, v7, 20 -; CHECK-NEXT: image_sample_lz v3, v[1:2], s[60:67], s[68:71] dmask:0x1 -; CHECK-NEXT: v_mov_b32_e32 v1, v2 ; CHECK-NEXT: v_readlane_b32 s57, v7, 21 ; CHECK-NEXT: v_readlane_b32 s58, v7, 22 ; CHECK-NEXT: v_readlane_b32 s59, v7, 23 diff --git a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir index 1445f6b7b58be..382a8d38fd652 100644 --- a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir +++ b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir @@ -486,7 +486,7 @@ body: | ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; CHECK-NEXT: INLINEASM &"; use 
$0 ", 1 /* sideeffect attdialect */, 39911433 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK-NEXT: INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 39321609 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; CHECK-NEXT: S_ENDPGM 0 bb.0: S_NOP 0, implicit-def $agpr0 @@ -516,7 +516,7 @@ body: | S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 - INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 39911433 /* reguse:VReg_512_Align2 */, %0:vreg_512_align2 + INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 39321609 /* reguse:VReg_512_Align2 */, %0:vreg_512_align2 S_ENDPGM 0 ... @@ -1368,7 +1368,7 @@ body: | ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) ; CHECK-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39911433 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39321609 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} @@ -1408,7 +1408,7 @@ body: | undef %2.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1) early-clobber %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %2, 0, 0, 0, implicit $mode, implicit $exec early-clobber %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39911433 /* reguse:VReg_512_Align2 */, %4 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39321609 /* reguse:VReg_512_Align2 */, %4 S_CBRANCH_VCCNZ %bb.1, implicit $vcc S_BRANCH %bb.2 @@ -1726,7 +1726,7 @@ body: | ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39911433 /* reguse:VReg_512_Align2 */, renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39321609 /* reguse:VReg_512_Align2 */, renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} @@ -1763,7 +1763,7 @@ body: | undef %0.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1) %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39911433 /* reguse:VReg_512_Align2 */, %4 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39321609 /* reguse:VReg_512_Align2 */, %4 S_CBRANCH_VCCNZ %bb.1, implicit $vcc S_BRANCH %bb.2 diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll index 82b9458d09c80..9e1d59064cb5e 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll @@ -8,16 +8,16 @@ define amdgpu_kernel void @s_input_output_i128() { ; GFX908-LABEL: name: s_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 10354698 /* regdef:SGPR_128 */, def %13 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 9764874 /* regdef:SGPR_128 */, def %13 ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %13 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 10354697 /* reguse:SGPR_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9764873 /* reguse:SGPR_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: s_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 10354698 /* regdef:SGPR_128 */, def %11 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 9764874 /* regdef:SGPR_128 */, def %11 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %11 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 10354697 /* reguse:SGPR_128 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9764873 /* reguse:SGPR_128 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=s"() call void asm sideeffect "; use $0", "s"(i128 %val) @@ -27,16 +27,16 @@ define amdgpu_kernel void @s_input_output_i128() { define amdgpu_kernel void @v_input_output_i128() { ; GFX908-LABEL: name: v_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* 
sideeffect attdialect */, 7602186 /* regdef:VReg_128 */, def %13 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7012362 /* regdef:VReg_128 */, def %13 ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %13 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7602185 /* reguse:VReg_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7012361 /* reguse:VReg_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: v_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7929866 /* regdef:VReg_128_Align2 */, def %11 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:VReg_128_Align2 */, def %11 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %11 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7929865 /* reguse:VReg_128_Align2 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7340041 /* reguse:VReg_128_Align2 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=v"() call void asm sideeffect "; use $0", "v"(i128 %val) @@ -47,16 +47,16 @@ define amdgpu_kernel void @a_input_output_i128() { ; GFX908-LABEL: name: a_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8519690 /* regdef:AReg_128 */, def %13 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7929866 /* regdef:AReg_128 */, def %13 ; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %13 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8519689 /* reguse:AReg_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7929865 /* reguse:AReg_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: a_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 9240586 /* regdef:AReg_128_Align2 */, def %11 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8650762 /* regdef:AReg_128_Align2 */, def %11 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %11 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = call i128 asm sideeffect "; def $0", "=a"() call void asm sideeffect "; use $0", "a"(i128 %val) diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index 76016e46426bd..92ea83fdfb982 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -238,11 +238,11 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_alignbit_b32 v2, s0, v2, 16 +; VI-NEXT: s_lshr_b32 s5, s2, 16 +; VI-NEXT: s_lshr_b64 s[0:1], s[4:5], 16 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -256,11 +256,11 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; CI-NEXT: 
s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s0, s2, 16 -; CI-NEXT: v_alignbit_b32 v2, s0, v2, 16 +; CI-NEXT: s_lshr_b32 s5, s2, 16 +; CI-NEXT: s_lshr_b64 s[0:1], s[4:5], 16 +; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -312,16 +312,16 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_lshr_b32 s0, s4, 16 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_lshr_b32 s3, s4, 16 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s1, s2, 16 -; VI-NEXT: v_alignbit_b32 v2, s1, v2, 16 +; VI-NEXT: s_lshr_b32 s5, s2, 16 +; VI-NEXT: s_lshr_b64 s[0:1], s[4:5], 16 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: ;;#ASMSTART -; VI-NEXT: ; use s0 +; VI-NEXT: ; use s3 ; VI-NEXT: ;;#ASMEND ; VI-NEXT: s_endpgm ; @@ -334,16 +334,16 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: s_lshr_b32 s0, s4, 16 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_lshr_b32 s3, s4, 16 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s1, s2, 16 -; CI-NEXT: v_alignbit_b32 v2, s1, v2, 16 +; CI-NEXT: s_lshr_b32 s5, s2, 16 +; CI-NEXT: s_lshr_b64 s[0:1], s[4:5], 16 +; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: ;;#ASMSTART -; CI-NEXT: ; use s0 +; CI-NEXT: ; use s3 ; CI-NEXT: ;;#ASMEND ; CI-NEXT: s_endpgm ; @@ -405,19 +405,19 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_lshr_b32 s0, s4, 16 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_lshr_b32 s3, s4, 16 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s1, s2, 16 -; VI-NEXT: v_alignbit_b32 v2, s1, v2, 16 +; VI-NEXT: s_lshr_b32 s5, s2, 16 +; VI-NEXT: s_lshr_b64 s[0:1], s[4:5], 16 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: ;;#ASMSTART -; VI-NEXT: ; use s0 +; VI-NEXT: ; use s3 ; VI-NEXT: ;;#ASMEND ; VI-NEXT: ;;#ASMSTART -; VI-NEXT: ; use s1 +; VI-NEXT: ; use s5 ; VI-NEXT: ;;#ASMEND ; VI-NEXT: s_endpgm ; @@ -430,19 +430,19 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: s_lshr_b32 s0, s4, 16 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_lshr_b32 s3, s4, 16 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s1, s2, 16 -; CI-NEXT: v_alignbit_b32 v2, s1, v2, 16 +; CI-NEXT: s_lshr_b32 s5, s2, 16 +; CI-NEXT: s_lshr_b64 s[0:1], s[4:5], 16 +; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dword v[0:1], v2 ; 
CI-NEXT: ;;#ASMSTART -; CI-NEXT: ; use s0 +; CI-NEXT: ; use s3 ; CI-NEXT: ;;#ASMEND ; CI-NEXT: ;;#ASMSTART -; CI-NEXT: ; use s1 +; CI-NEXT: ; use s5 ; CI-NEXT: ;;#ASMEND ; CI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll index f705a2ffc4f1d..5e2cec504c6a9 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -5779,19 +5779,17 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX7-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2] -; GFX7-GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v8 +; GFX7-GISEL-NEXT: v_add_i32_e32 v1, vcc, v0, v8 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v2, 0 ; GFX7-GISEL-NEXT: v_addc_u32_e32 v11, vcc, v6, v9, vcc -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v3, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v3, v[5:6] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8] ; GFX7-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v0 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc ; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v1, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[3:4] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 ; GFX7-GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6] @@ -5835,19 +5833,17 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2] -; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, v0, v8 +; GFX8-GISEL-NEXT: v_add_u32_e32 v1, vcc, v0, v8 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v2, 0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v11, vcc, v6, v9, vcc -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v3, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v3, v[5:6] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8] ; GFX8-GISEL-NEXT: v_add_u32_e32 v7, vcc, 1, v0 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 1, v4 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v1, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[3:4] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6] @@ -5887,19 +5883,17 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 ; 
GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, v0, v8 +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v1, vcc, v0, v8 ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v2, 0 ; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, v6, v9, vcc -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v3, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v3, v[5:6] ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8] ; GFX900-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc ; GFX900-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v4 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v1, v[0:1] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[3:4] ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 ; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v9, vcc ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6] @@ -5989,28 +5983,26 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX10-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, 1 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v6, v3, v[1:2] -; GFX10-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v0, v6 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v7, v2, v[4:5] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v8, v2, 0 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v9, null, v4, v7, vcc_lo -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v8, v3, v[1:2] ; GFX10-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v0, 1 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v4, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v5, v8, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v9, v2, v[6:7] -; GFX10-GISEL-NEXT: v_add_co_u32 v7, vcc_lo, v5, 1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v4 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v3, v7, 0 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v6, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v5, v10, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v3, v9, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v6, v8, v[4:5] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v4, v7, v[1:2] +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v1, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v8, v2, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v8, v3, v[1:2] +; GFX10-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v0, v8 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v9, v2, v[4:5] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v1, v2, 0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v9, null, v6, v9, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v1, v3, v[5:6] +; GFX10-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v0, 1 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v0, null, 0, v6, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v4, v8, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, 
v9, v2, v[7:8] +; GFX10-GISEL-NEXT: v_add_co_u32 v7, vcc_lo, v4, 1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v4, v0, v[6:7] +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v2, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v5, v7, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v2, v8, v[3:4] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v5, v4, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v2, v7, v[3:4] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: clpeak_imad_pat_i64: @@ -6049,37 +6041,35 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX11-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_co_u32 v7, vcc_lo, v0, 1 +; GFX11-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v0, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v8, null, 0, v1, vcc_lo -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v7, v2, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v7, v3, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v8, v2, v[4:5] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v7 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v4, v2, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v10, null, v5, v8, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v1, vcc_lo +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v8, v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v8, v3, v[1:2] +; GFX11-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v0, v8 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v9, v2, v[4:5] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v1, v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v10, null, v6, v9, vcc_lo +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v1, v3, v[5:6] ; GFX11-GISEL-NEXT: v_add_co_u32 v11, vcc_lo, v0, 1 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v5, vcc_lo -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, v7 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v4, v3, v[1:2] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v6, v11, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v0, null, 0, v6, vcc_lo +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v4, v11, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v10, v2, v[7:8] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-GISEL-NEXT: v_add_co_u32 v9, vcc_lo, v6, 1 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v6, v12, v[2:3] -; GFX11-GISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v3, v9, 0 +; GFX11-GISEL-NEXT: v_add_co_u32 v9, vcc_lo, v4, 1 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v4, v0, v[6:7] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v5, v9, 0 ; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v8, vcc_lo -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v8, v11, v[4:5] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v3, v10, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v5, v9, v[6:7] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v8, v11, v[2:3] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v5, v10, v[1:2] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v3, v9, v[6:7] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1200-SDAG-LABEL: clpeak_imad_pat_i64: @@ -6408,52 +6398,46 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX7-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v1, vcc +; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] +; GFX7-GISEL-NEXT: v_add_i32_e32 v1, vcc, v0, v12 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v4, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v4, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, v10, v13, vcc ; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v2 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v4, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v1, v5, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v16, v6, 0 ; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v6, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v5, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v7, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[8:9] -; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, v0, v14 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v6, v[10:11] -; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v12, v15, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0 -; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, v2, v16 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v16, v6, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v10 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v7, v[1:2] -; GFX7-GISEL-NEXT: v_addc_u32_e32 v19, vcc, v8, v17, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v4, v[10:11] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v19, v6, v[14:15] -; GFX7-GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v12, vcc -; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v2 -; 
GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v9 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1] -; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v16, vcc -; GFX7-GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v13 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v3, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v14, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v12, v[8:9] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v17, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v15, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v18, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v14, v[8:9] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v17, v[11:12] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v15, v4, v[11:12] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v7, v[2:3] +; GFX7-GISEL-NEXT: v_add_i32_e32 v9, vcc, v1, v16 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v6, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v12, vcc, v11, v17, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[3:4] +; GFX7-GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc +; GFX7-GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v1 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v12, v6, v[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v3, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v9, 0 +; GFX7-GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v2 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v7, v[6:7] +; GFX7-GISEL-NEXT: v_add_i32_e64 v16, s[4:5], 1, v8 +; GFX7-GISEL-NEXT: v_addc_u32_e64 v6, s[4:5], 0, v13, s[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v9, v[2:3] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v10, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v16, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v15, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v14, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v6, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v4, v17, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v16, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v15, v[11:12] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6513,52 +6497,46 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX8-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 1, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] +; GFX8-GISEL-NEXT: v_add_u32_e32 v1, vcc, v0, v12 +; GFX8-GISEL-NEXT: 
v_mad_u64_u32 v[10:11], s[4:5], v13, v4, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v4, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, v10, v13, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 1, v2 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v4, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v1, v5, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v16, v6, 0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v6, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v5, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v7, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[8:9] -; GFX8-GISEL-NEXT: v_add_u32_e32 v3, vcc, v0, v14 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v6, v[10:11] -; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v12, v15, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, v2, v16 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v16, v6, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v10 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v7, v[1:2] -; GFX8-GISEL-NEXT: v_addc_u32_e32 v19, vcc, v8, v17, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v4, v[10:11] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v19, v6, v[14:15] -; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v12, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 1, v2 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v9 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1] -; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v16, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v17, vcc, 1, v13 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v14, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v12, v[8:9] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v17, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v15, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v18, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v14, v[8:9] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v17, v[11:12] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v15, v4, v[11:12] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v7, v[2:3] +; GFX8-GISEL-NEXT: v_add_u32_e32 v9, vcc, v1, v16 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v6, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v12, vcc, v11, v17, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[3:4] +; GFX8-GISEL-NEXT: v_add_u32_e32 v9, vcc, 1, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, 1, v1 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v12, v6, v[4:5] +; GFX8-GISEL-NEXT: 
v_mad_u64_u32 v[4:5], s[4:5], v2, v10, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v3, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v9, 0 +; GFX8-GISEL-NEXT: v_add_u32_e32 v15, vcc, 1, v2 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v7, v[6:7] +; GFX8-GISEL-NEXT: v_add_u32_e64 v16, s[4:5], 1, v8 +; GFX8-GISEL-NEXT: v_addc_u32_e64 v6, s[4:5], 0, v13, s[4:5] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v9, v[2:3] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v10, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v16, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v15, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v14, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v6, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v4, v17, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v16, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v15, v[11:12] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6610,52 +6588,46 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX900-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX900-GISEL: ; %bb.0: ; %entry ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v1, vcc +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v1, vcc, v0, v12 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v4, v[8:9] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v4, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, v10, v13, vcc ; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, 1, v2 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v4, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v1, v5, v[9:10] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v16, v6, 0 ; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v3, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v6, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v5, v[1:2] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v7, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[8:9] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, v0, v14 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v6, v[10:11] -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, v12, v15, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0 -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, v2, v16 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v16, v6, 0 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v10 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[1:2] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v7, v[1:2] -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v19, vcc, v8, v17, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v4, v[10:11] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v19, v6, v[14:15] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 
v6, vcc, 0, v12, vcc -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v2 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, v[0:1] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v8, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[1:2] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v9 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1] -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v16, vcc -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v17, vcc, 1, v13 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v3, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v14, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v12, v[8:9] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v17, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v15, v[1:2] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v18, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v14, v[8:9] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v17, v[11:12] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v15, v4, v[11:12] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v7, v[2:3] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v9, vcc, v1, v16 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v6, v[3:4] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v12, vcc, v11, v17, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[3:4] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v9, vcc, 1, v0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v10, vcc +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v1 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v12, v6, v[4:5] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v11, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v3, v[5:6] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v9, 0 +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v15, vcc, 1, v2 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v7, v[6:7] +; GFX900-GISEL-NEXT: v_add_co_u32_e64 v16, s[4:5], 1, v8 +; GFX900-GISEL-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, v13, s[4:5] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v9, v[2:3] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v10, v[0:1] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v16, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v15, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v14, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v6, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v4, v17, v[3:4] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v16, v[9:10] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v15, v[11:12] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6805,50 +6777,46 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX10-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_add_co_u32 v12, vcc_lo, v0, 1 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v1, vcc_lo -; GFX10-GISEL-NEXT: v_add_co_u32 v14, vcc_lo, v2, 1 -; GFX10-GISEL-NEXT: 
v_mad_u64_u32 v[0:1], null, v12, v4, 0 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v3, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v14, v6, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v12, v5, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v14, v7, v[3:4] -; GFX10-GISEL-NEXT: v_add_co_u32 v3, vcc_lo, v0, v12 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v13, v4, v[8:9] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v3, v4, 0 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v16, null, v10, v13, vcc_lo -; GFX10-GISEL-NEXT: v_add_co_u32 v17, vcc_lo, v2, v14 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v15, v6, v[9:10] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v17, v6, 0 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v18, null, v8, v15, vcc_lo -; GFX10-GISEL-NEXT: v_add_co_u32 v19, vcc_lo, v0, 1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v20, null, 0, v10, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v11, v19, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v3, v5, v[0:1] -; GFX10-GISEL-NEXT: v_add_co_u32 v15, vcc_lo, v2, 1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v17, v7, v[1:2] -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v8, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v13, v15, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v10 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v16, v4, v[14:15] -; GFX10-GISEL-NEXT: v_add_co_u32 v14, vcc_lo, v11, 1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v18, v6, v[0:1] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, v8 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v11, v20, v[1:2] -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v16, null, 0, v4, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v13, v12, v[6:7] -; GFX10-GISEL-NEXT: v_add_co_u32 v17, vcc_lo, v13, 1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v9, v14, 0 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v18, null, 0, v5, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v4, v19, v[10:11] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v7, v17, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v5, v15, v[11:12] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v9, v16, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v7, v18, v[3:4] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v12, v14, v[5:6] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v4, v17, v[7:8] +; GFX10-GISEL-NEXT: v_add_co_u32 v11, vcc_lo, v0, 1 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v1, vcc_lo +; GFX10-GISEL-NEXT: v_add_co_u32 v16, vcc_lo, v2, 1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v11, v4, 0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v3, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v16, v6, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v11, v5, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v16, v7, v[3:4] +; GFX10-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v0, v11 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v15, v4, v[8:9] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v17, v6, v[9:10] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v1, v4, 0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v12, v15, vcc_lo +; GFX10-GISEL-NEXT: v_add_co_u32 v16, vcc_lo, v2, v16 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v18, null, v13, v17, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v16, v6, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v1, v5, v[9:10] +; 
GFX10-GISEL-NEXT: v_mad_u64_u32 v[15:16], null, v16, v7, v[11:12] +; GFX10-GISEL-NEXT: v_add_co_u32 v11, vcc_lo, v0, 1 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v0, null, 0, v12, vcc_lo +; GFX10-GISEL-NEXT: v_add_co_u32 v12, vcc_lo, v2, 1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[16:17], null, v3, v4, v[14:15] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v8, v11, 0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v13, vcc_lo +; GFX10-GISEL-NEXT: v_add_co_u32 v13, vcc_lo, v8, 1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v18, v6, v[15:16] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v10, v12, 0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v16, vcc_lo +; GFX10-GISEL-NEXT: v_add_co_u32 v17, vcc_lo, v10, 1 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v18, null, 0, v14, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v8, v0, v[5:6] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v6, v17, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v10, v1, v[7:8] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v4, v13, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v16, v11, v[8:9] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v4, v15, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v14, v12, v[9:10] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v6, v18, v[3:4] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v7, v13, v[4:5] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v8, v17, v[9:10] ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6911,63 +6879,60 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX11-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_co_u32 v13, vcc_lo, v0, 1 +; GFX11-GISEL-NEXT: v_add_co_u32 v11, vcc_lo, v0, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v14, null, 0, v1, vcc_lo -; GFX11-GISEL-NEXT: v_add_co_u32 v15, vcc_lo, v2, 1 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v13, v4, 0 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v16, null, 0, v3, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v1, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_u32 v16, vcc_lo, v2, 1 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v11, v4, 0 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v3, vcc_lo ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v15, v6, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v13, v5, v[1:2] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v16, v6, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v11, v5, v[1:2] ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v15, v7, v[3:4] -; GFX11-GISEL-NEXT: v_add_co_u32 v3, vcc_lo, v0, v13 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v14, v4, v[8:9] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v16, v6, v[9:10] -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v17, null, v10, v14, vcc_lo -; GFX11-GISEL-NEXT: v_add_co_u32 v18, vcc_lo, v2, v15 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v3, v4, 0 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v19, null, v11, v16, vcc_lo -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v18, v6, 0 -; GFX11-GISEL-NEXT: v_add_co_u32 v20, vcc_lo, v0, 1 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v10, vcc_lo -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v9 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v8, v20, 0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v3, v5, v[0:1] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v10 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v18, v7, v[1:2] -; GFX11-GISEL-NEXT: v_add_co_u32 v18, vcc_lo, v2, 1 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v11, vcc_lo +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v16, v7, v[3:4] +; GFX11-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v0, v11 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v15, v4, v[8:9] ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[15:16], null, v17, v4, v[13:14] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v12, v18, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v19, v6, v[14:15] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v8, v21, v[0:1] -; GFX11-GISEL-NEXT: v_add_co_u32 v14, vcc_lo, v8, 1 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v16, null, 0, v15, vcc_lo -; GFX11-GISEL-NEXT: v_add_co_u32 v17, vcc_lo, v12, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v12, v22, v[5:6] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v9, v14, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v4, v17, 0 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v10, vcc_lo -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v15, v20, v[6:7] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v10, v18, v[7:8] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v9, v16, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v4, v19, v[3:4] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v17, v6, v[9:10] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v1, v4, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v12, v15, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_u32 v18, vcc_lo, v2, v16 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v19, null, v13, v17, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v18, v6, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v1, v5, v[9:10] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[15:16], null, v18, v7, v[11:12] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[16:17], null, v3, v4, v[14:15] +; GFX11-GISEL-NEXT: v_add_co_u32 v14, vcc_lo, v0, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v11, v14, v[7:8] -; GFX11-GISEL-NEXT: 
v_mad_u64_u32 v[3:4], null, v12, v17, v[8:9] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v0, null, 0, v12, vcc_lo +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v8, v14, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[17:18], null, v19, v6, v[15:16] +; GFX11-GISEL-NEXT: v_add_co_u32 v15, vcc_lo, v2, 1 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v13, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_u32 v13, vcc_lo, v8, 1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v10, v15, 0 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v18, null, 0, v16, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_u32 v19, vcc_lo, v10, 1 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v20, null, 0, v17, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v8, v0, v[5:6] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v10, v1, v[7:8] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v4, v13, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v6, v19, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v16, v14, v[11:12] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v17, v15, v[8:9] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v4, v18, v[1:2] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v6, v20, v[3:4] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v9, v13, v[7:8] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v10, v19, v[11:12] ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll index 69a871f6f6ae5..fa0568d307907 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll @@ -262,12 +262,12 @@ define amdgpu_kernel void @test_flat_misaligned_v2(ptr %arg) { ; ALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 -; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; ALIGNED-GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; ALIGNED-GFX10-NEXT: v_add_co_u32 v3, s0, s0, v0 +; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s1, 0, s0 +; ALIGNED-GFX10-NEXT: flat_load_dwordx2 v[0:1], v[3:4] ; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 -; ALIGNED-GFX10-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0 +; ALIGNED-GFX10-NEXT: flat_store_dwordx2 v[3:4], v[1:2] ; ALIGNED-GFX10-NEXT: s_endpgm ; ; UNALIGNED-GFX10-LABEL: test_flat_misaligned_v2: @@ -275,12 +275,12 @@ define amdgpu_kernel void @test_flat_misaligned_v2(ptr %arg) { ; UNALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 -; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; 
UNALIGNED-GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; UNALIGNED-GFX10-NEXT: v_add_co_u32 v3, s0, s0, v0 +; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s1, 0, s0 +; UNALIGNED-GFX10-NEXT: flat_load_dwordx2 v[0:1], v[3:4] ; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2 -; UNALIGNED-GFX10-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0 +; UNALIGNED-GFX10-NEXT: flat_store_dwordx2 v[3:4], v[1:2] ; UNALIGNED-GFX10-NEXT: s_endpgm ; ; ALIGNED-GFX11-LABEL: test_flat_misaligned_v2: @@ -290,13 +290,13 @@ define amdgpu_kernel void @test_flat_misaligned_v2(ptr %arg) { ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0 +; ALIGNED-GFX11-NEXT: v_add_co_u32 v3, s0, s0, v0 ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 -; ALIGNED-GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, s1, 0, s0 +; ALIGNED-GFX11-NEXT: flat_load_b64 v[0:1], v[3:4] ; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v4, v2 -; ALIGNED-GFX11-NEXT: flat_store_b64 v[0:1], v[3:4] +; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0 +; ALIGNED-GFX11-NEXT: flat_store_b64 v[3:4], v[1:2] ; ALIGNED-GFX11-NEXT: s_endpgm ; ; UNALIGNED-GFX11-LABEL: test_flat_misaligned_v2: @@ -306,13 +306,13 @@ define amdgpu_kernel void @test_flat_misaligned_v2(ptr %arg) { ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0 +; UNALIGNED-GFX11-NEXT: v_add_co_u32 v3, s0, s0, v0 ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 -; UNALIGNED-GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, s1, 0, s0 +; UNALIGNED-GFX11-NEXT: flat_load_b64 v[0:1], v[3:4] ; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v4, v2 -; UNALIGNED-GFX11-NEXT: flat_store_b64 v[0:1], v[3:4] +; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0 +; UNALIGNED-GFX11-NEXT: flat_store_b64 v[3:4], v[1:2] ; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -462,13 +462,12 @@ define amdgpu_kernel void @test_flat_misaligned_v3(ptr %arg) { ; ALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_add_co_u32 v5, s0, s0, v0 -; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s1, 0, s0 -; ALIGNED-GFX10-NEXT: flat_load_dwordx3 v[0:2], v[5:6] +; ALIGNED-GFX10-NEXT: v_add_co_u32 v4, s0, s0, v0 +; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, s1, 0, s0 +; ALIGNED-GFX10-NEXT: flat_load_dwordx3 v[1:3], v[4:5] ; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v3, v0 -; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v1 -; ALIGNED-GFX10-NEXT: flat_store_dwordx3 v[5:6], v[2:4] +; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v0, v3 +; ALIGNED-GFX10-NEXT: flat_store_dwordx3 v[4:5], v[0:2] ; ALIGNED-GFX10-NEXT: s_endpgm ; ; UNALIGNED-GFX10-LABEL: test_flat_misaligned_v3: @@ -476,13 
+475,12 @@ define amdgpu_kernel void @test_flat_misaligned_v3(ptr %arg) { ; UNALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_add_co_u32 v5, s0, s0, v0 -; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s1, 0, s0 -; UNALIGNED-GFX10-NEXT: flat_load_dwordx3 v[0:2], v[5:6] +; UNALIGNED-GFX10-NEXT: v_add_co_u32 v4, s0, s0, v0 +; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, s1, 0, s0 +; UNALIGNED-GFX10-NEXT: flat_load_dwordx3 v[1:3], v[4:5] ; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v3, v0 -; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v1 -; UNALIGNED-GFX10-NEXT: flat_store_dwordx3 v[5:6], v[2:4] +; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v0, v3 +; UNALIGNED-GFX10-NEXT: flat_store_dwordx3 v[4:5], v[0:2] ; UNALIGNED-GFX10-NEXT: s_endpgm ; ; ALIGNED-GFX11-LABEL: test_flat_misaligned_v3: @@ -492,13 +490,13 @@ define amdgpu_kernel void @test_flat_misaligned_v3(ptr %arg) { ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_add_co_u32 v5, s0, s0, v0 +; ALIGNED-GFX11-NEXT: v_add_co_u32 v4, s0, s0, v0 ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s1, 0, s0 -; ALIGNED-GFX11-NEXT: flat_load_b96 v[0:2], v[5:6] +; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, s1, 0, s0 +; ALIGNED-GFX11-NEXT: flat_load_b96 v[1:3], v[4:5] ; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; ALIGNED-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 -; ALIGNED-GFX11-NEXT: flat_store_b96 v[5:6], v[2:4] +; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v0, v3 +; ALIGNED-GFX11-NEXT: flat_store_b96 v[4:5], v[0:2] ; ALIGNED-GFX11-NEXT: s_endpgm ; ; UNALIGNED-GFX11-LABEL: test_flat_misaligned_v3: @@ -508,13 +506,13 @@ define amdgpu_kernel void @test_flat_misaligned_v3(ptr %arg) { ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_add_co_u32 v5, s0, s0, v0 +; UNALIGNED-GFX11-NEXT: v_add_co_u32 v4, s0, s0, v0 ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s1, 0, s0 -; UNALIGNED-GFX11-NEXT: flat_load_b96 v[0:2], v[5:6] +; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, s1, 0, s0 +; UNALIGNED-GFX11-NEXT: flat_load_b96 v[1:3], v[4:5] ; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNALIGNED-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 -; UNALIGNED-GFX11-NEXT: flat_store_b96 v[5:6], v[2:4] +; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v0, v3 +; UNALIGNED-GFX11-NEXT: flat_store_b96 v[4:5], v[0:2] ; UNALIGNED-GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -606,36 +604,33 @@ define amdgpu_kernel void @test_local_aligned_v3(ptr addrspace(3) %arg) { ; SPLIT: ; %bb.0: ; %bb ; SPLIT-NEXT: s_load_dword s0, s[4:5], 0x24 ; SPLIT-NEXT: s_waitcnt lgkmcnt(0) -; SPLIT-NEXT: v_lshl_add_u32 v5, v0, 2, s0 -; SPLIT-NEXT: ds_read_b96 v[0:2], v5 +; SPLIT-NEXT: v_lshl_add_u32 v4, v0, 2, s0 +; SPLIT-NEXT: ds_read_b96 v[1:3], v4 ; SPLIT-NEXT: s_waitcnt lgkmcnt(0) -; SPLIT-NEXT: v_mov_b32_e32 v3, v0 -; SPLIT-NEXT: v_mov_b32_e32 v4, v1 -; SPLIT-NEXT: ds_write_b96 v5, v[2:4] +; 
SPLIT-NEXT: v_mov_b32_e32 v0, v3
+; SPLIT-NEXT: ds_write_b96 v4, v[0:2]
 ; SPLIT-NEXT: s_endpgm
 ;
 ; ALIGNED-GFX10-LABEL: test_local_aligned_v3:
 ; ALIGNED-GFX10: ; %bb.0: ; %bb
 ; ALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24
 ; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GFX10-NEXT: v_lshl_add_u32 v5, v0, 2, s0
-; ALIGNED-GFX10-NEXT: ds_read_b96 v[0:2], v5
+; ALIGNED-GFX10-NEXT: v_lshl_add_u32 v4, v0, 2, s0
+; ALIGNED-GFX10-NEXT: ds_read_b96 v[1:3], v4
 ; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v3, v0
-; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v1
-; ALIGNED-GFX10-NEXT: ds_write_b96 v5, v[2:4]
+; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v0, v3
+; ALIGNED-GFX10-NEXT: ds_write_b96 v4, v[0:2]
 ; ALIGNED-GFX10-NEXT: s_endpgm
 ;
 ; UNALIGNED-GFX10-LABEL: test_local_aligned_v3:
 ; UNALIGNED-GFX10: ; %bb.0: ; %bb
 ; UNALIGNED-GFX10-NEXT: s_load_dword s0, s[4:5], 0x24
 ; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-GFX10-NEXT: v_lshl_add_u32 v5, v0, 2, s0
-; UNALIGNED-GFX10-NEXT: ds_read_b96 v[0:2], v5
+; UNALIGNED-GFX10-NEXT: v_lshl_add_u32 v4, v0, 2, s0
+; UNALIGNED-GFX10-NEXT: ds_read_b96 v[1:3], v4
 ; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v3, v0
-; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v1
-; UNALIGNED-GFX10-NEXT: ds_write_b96 v5, v[2:4]
+; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v0, v3
+; UNALIGNED-GFX10-NEXT: ds_write_b96 v4, v[0:2]
 ; UNALIGNED-GFX10-NEXT: s_endpgm
 ;
 ; ALIGNED-GFX11-LABEL: test_local_aligned_v3:
@@ -644,11 +639,11 @@ define amdgpu_kernel void @test_local_aligned_v3(ptr addrspace(3) %arg) {
 ; ALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; ALIGNED-GFX11-NEXT: v_lshl_add_u32 v5, v0, 2, s0
-; ALIGNED-GFX11-NEXT: ds_load_b96 v[0:2], v5
+; ALIGNED-GFX11-NEXT: v_lshl_add_u32 v4, v0, 2, s0
+; ALIGNED-GFX11-NEXT: ds_load_b96 v[1:3], v4
 ; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
-; ALIGNED-GFX11-NEXT: ds_store_b96 v5, v[2:4]
+; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v0, v3
+; ALIGNED-GFX11-NEXT: ds_store_b96 v4, v[0:2]
 ; ALIGNED-GFX11-NEXT: s_endpgm
 ;
 ; UNALIGNED-GFX11-LABEL: test_local_aligned_v3:
@@ -657,11 +652,11 @@ define amdgpu_kernel void @test_local_aligned_v3(ptr addrspace(3) %arg) {
 ; UNALIGNED-GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; UNALIGNED-GFX11-NEXT: v_lshl_add_u32 v5, v0, 2, s0
-; UNALIGNED-GFX11-NEXT: ds_load_b96 v[0:2], v5
+; UNALIGNED-GFX11-NEXT: v_lshl_add_u32 v4, v0, 2, s0
+; UNALIGNED-GFX11-NEXT: ds_load_b96 v[1:3], v4
 ; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
-; UNALIGNED-GFX11-NEXT: ds_store_b96 v5, v[2:4]
+; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v0, v3
+; UNALIGNED-GFX11-NEXT: ds_store_b96 v4, v[0:2]
 ; UNALIGNED-GFX11-NEXT: s_endpgm
 bb:
 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -683,12 +678,12 @@ define amdgpu_kernel void @test_flat_aligned_v2(ptr %arg) {
 ; SPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; SPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; SPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; SPLIT-NEXT: v_add_co_u32 v0, s0, s0, v0
-; SPLIT-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
-; SPLIT-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; SPLIT-NEXT: v_add_co_u32 v3, s0, s0, v0
+; SPLIT-NEXT: v_add_co_ci_u32_e64 v4, s0, s1, 0, s0
+; SPLIT-NEXT: flat_load_dwordx2 v[0:1], v[3:4]
 ; SPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; SPLIT-NEXT: v_mov_b32_e32 v4, v2
-; SPLIT-NEXT: flat_store_dwordx2 v[0:1], v[3:4]
+; SPLIT-NEXT: v_mov_b32_e32 v2, v0
+; SPLIT-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
 ; SPLIT-NEXT: s_endpgm
 ;
 ; ALIGNED-GFX10-LABEL: test_flat_aligned_v2:
@@ -696,12 +691,12 @@ define amdgpu_kernel void @test_flat_aligned_v2(ptr %arg) {
 ; ALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; ALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; ALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0
-; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
-; ALIGNED-GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; ALIGNED-GFX10-NEXT: v_add_co_u32 v3, s0, s0, v0
+; ALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s1, 0, s0
+; ALIGNED-GFX10-NEXT: flat_load_dwordx2 v[0:1], v[3:4]
 ; ALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2
-; ALIGNED-GFX10-NEXT: flat_store_dwordx2 v[0:1], v[3:4]
+; ALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0
+; ALIGNED-GFX10-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
 ; ALIGNED-GFX10-NEXT: s_endpgm
 ;
 ; UNALIGNED-GFX10-LABEL: test_flat_aligned_v2:
@@ -709,12 +704,12 @@ define amdgpu_kernel void @test_flat_aligned_v2(ptr %arg) {
 ; UNALIGNED-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; UNALIGNED-GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; UNALIGNED-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0
-; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
-; UNALIGNED-GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; UNALIGNED-GFX10-NEXT: v_add_co_u32 v3, s0, s0, v0
+; UNALIGNED-GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s1, 0, s0
+; UNALIGNED-GFX10-NEXT: flat_load_dwordx2 v[0:1], v[3:4]
 ; UNALIGNED-GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v4, v2
-; UNALIGNED-GFX10-NEXT: flat_store_dwordx2 v[0:1], v[3:4]
+; UNALIGNED-GFX10-NEXT: v_mov_b32_e32 v2, v0
+; UNALIGNED-GFX10-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
 ; UNALIGNED-GFX10-NEXT: s_endpgm
 ;
 ; ALIGNED-GFX11-LABEL: test_flat_aligned_v2:
@@ -724,13 +719,13 @@ define amdgpu_kernel void @test_flat_aligned_v2(ptr %arg) {
 ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; ALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; ALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0
+; ALIGNED-GFX11-NEXT: v_add_co_u32 v3, s0, s0, v0
 ; ALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0
-; ALIGNED-GFX11-NEXT: flat_load_b64 v[2:3], v[0:1]
+; ALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, s1, 0, s0
+; ALIGNED-GFX11-NEXT: flat_load_b64 v[0:1], v[3:4]
 ; ALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v4, v2
-; ALIGNED-GFX11-NEXT: flat_store_b64 v[0:1], v[3:4]
+; ALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0
+; ALIGNED-GFX11-NEXT: flat_store_b64 v[3:4], v[1:2]
 ; ALIGNED-GFX11-NEXT: s_endpgm
 ;
 ; UNALIGNED-GFX11-LABEL: test_flat_aligned_v2:
@@ -740,13 +735,13 @@ define amdgpu_kernel void @test_flat_aligned_v2(ptr %arg) {
 ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; UNALIGNED-GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; UNALIGNED-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0
+; UNALIGNED-GFX11-NEXT: v_add_co_u32 v3, s0, s0, v0
 ; UNALIGNED-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0
-; UNALIGNED-GFX11-NEXT: flat_load_b64 v[2:3], v[0:1]
+; UNALIGNED-GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, s1, 0, s0
+; UNALIGNED-GFX11-NEXT: flat_load_b64 v[0:1], v[3:4]
 ; UNALIGNED-GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v4, v2
-; UNALIGNED-GFX11-NEXT: flat_store_b64 v[0:1], v[3:4]
+; UNALIGNED-GFX11-NEXT: v_mov_b32_e32 v2, v0
+; UNALIGNED-GFX11-NEXT: flat_store_b64 v[3:4], v[1:2]
 ; UNALIGNED-GFX11-NEXT: s_endpgm
 bb:
 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -942,21 +937,19 @@ define amdgpu_kernel void @test_flat_v4_aligned8(ptr %arg) {
 ; SPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; SPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; SPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; SPLIT-NEXT: v_add_co_u32 v0, s0, s0, v0
-; SPLIT-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
-; SPLIT-NEXT: v_add_co_u32 v2, vcc_lo, v0, 8
-; SPLIT-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; SPLIT-NEXT: v_add_co_u32 v6, s0, s0, v0
+; SPLIT-NEXT: v_add_co_ci_u32_e64 v7, s0, s1, 0, s0
+; SPLIT-NEXT: v_add_co_u32 v8, vcc_lo, v6, 8
+; SPLIT-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v7, vcc_lo
 ; SPLIT-NEXT: s_clause 0x1
-; SPLIT-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; SPLIT-NEXT: flat_load_dwordx2 v[6:7], v[2:3]
+; SPLIT-NEXT: flat_load_dwordx2 v[0:1], v[6:7]
+; SPLIT-NEXT: flat_load_dwordx2 v[3:4], v[8:9]
 ; SPLIT-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
-; SPLIT-NEXT: v_mov_b32_e32 v8, v5
-; SPLIT-NEXT: v_mov_b32_e32 v9, v4
+; SPLIT-NEXT: v_mov_b32_e32 v2, v0
 ; SPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; SPLIT-NEXT: v_mov_b32_e32 v4, v7
-; SPLIT-NEXT: v_mov_b32_e32 v5, v6
-; SPLIT-NEXT: flat_store_dwordx2 v[2:3], v[8:9]
-; SPLIT-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
+; SPLIT-NEXT: v_mov_b32_e32 v5, v3
+; SPLIT-NEXT: flat_store_dwordx2 v[8:9], v[1:2]
+; SPLIT-NEXT: flat_store_dwordx2 v[6:7], v[4:5]
 ; SPLIT-NEXT: s_endpgm
 ;
 ; ALIGNED-GFX10-LABEL: test_flat_v4_aligned8:
diff --git a/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir b/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir
index 02eda2c4822c2..f5dbfb85efb33 100644
--- a/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir
+++ b/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir
@@ -5,7 +5,35 @@
 # source.
 # No more registers shall be defined
 ---
-name: main
+name: limit_coalesce
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr16, $sgpr17
+
+    ; CHECK-LABEL: name: limit_coalesce
+    ; CHECK: liveins: $sgpr16, $sgpr17
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:sgpr_64 = COPY $sgpr17
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub0:sgpr_64 = COPY $sgpr16
+    ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; CHECK-NEXT: INLINEASM &"; def $0", 0 /* attdialect */, 3407882 /* regdef:AReg_64 */, def %4
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:vreg_128 = COPY %4.sub1
+    ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR [[V_MOV_B32_e32_]], [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: SI_RETURN
+    %0:sgpr_32 = COPY killed $sgpr17
+    %1:sgpr_32 = COPY killed $sgpr16
+    undef %2.sub0:sgpr_64 = COPY killed %1
+    %2.sub1:sgpr_64 = COPY killed %0
+    %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    INLINEASM &"; def $0", 0 /* attdialect */, 3407882 /* regdef:VReg_64 */, def %4:vreg_64
+    undef %5.sub0:vreg_128 = COPY killed %4.sub1
+    GLOBAL_STORE_DWORDX4_SADDR killed %3, killed %5, killed %2, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    SI_RETURN
+...
+
+---
+name: allow_coalesce
 tracksRegLiveness: true
 registers:
   - { id: 0, class: sreg_32_xm0, preferred-register: '%0' }
@@ -14,23 +42,18 @@ body: |
   bb.0:
     liveins: $sgpr0, $vgpr0_vgpr1
-    ; CHECK-LABEL: name: main
+    ; CHECK-LABEL: name: allow_coalesce
     ; CHECK: liveins: $sgpr0, $vgpr0_vgpr1
     ; CHECK-NEXT: {{ $}}
-    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $sgpr0
-    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY [[DEF]].sub0
-    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:vreg_64 = COPY [[COPY]].sub1
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:vreg_64 = COPY [[COPY]].sub0
-    ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr
-    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_96 = IMPLICIT_DEF
-    ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:vreg_96 = COPY [[DEF1]]
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:vreg_96 = COPY [[DEF]].sub0
-    ; CHECK-NEXT: FLAT_STORE_DWORDX3 $vgpr0_vgpr1, [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr
-    ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
-    ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1_sub2:vreg_128 = COPY [[DEF2]]
-    ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub3:vreg_128 = COPY [[DEF]].sub0
-    ; CHECK-NEXT: FLAT_STORE_DWORDX4 $vgpr0_vgpr1, [[COPY3]], 0, 0, implicit $exec, implicit $flat_scr
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_128 = COPY $sgpr0
+    ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, [[COPY]].sub0_sub1, 0, 0, implicit $exec, implicit $flat_scr
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_96 = IMPLICIT_DEF
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:vreg_96 = COPY [[DEF]]
+    ; CHECK-NEXT: FLAT_STORE_DWORDX3 $vgpr0_vgpr1, [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+    ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1_sub2:vreg_128 = COPY [[DEF1]]
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub3:vreg_128 = COPY undef [[COPY]].sub2
+    ; CHECK-NEXT: FLAT_STORE_DWORDX4 $vgpr0_vgpr1, [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr
     %2:vreg_64 = IMPLICIT_DEF
     undef %3.sub0:vreg_64 = COPY $sgpr0
     %3.sub1:vreg_64 = COPY %2.sub0
@@ -49,3 +72,4 @@ body: |
     FLAT_STORE_DWORDX4 $vgpr0_vgpr1, killed %8, 0, 0, implicit $exec, implicit $flat_scr
 ...
+
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
index 8bb7274c84620..76b97e843d777 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -78,7 +78,6 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; SI-NEXT: s_mov_b32 s2, -1
 ; SI-NEXT: s_mov_b32 s3, 0xfffff
-; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_bfe_u32 v4, v3, 20, 11
 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0xfffffc01, v4
@@ -93,12 +92,12 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
 ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
 ; SI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5]
-; SI-NEXT: s_brev_b32 s2, -2
+; SI-NEXT: v_mov_b32_e32 v2, 0x3ff00000
 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
-; SI-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
-; SI-NEXT: v_bfi_b32 v3, s2, v2, v3
-; SI-NEXT: v_mov_b32_e32 v2, v1
-; SI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; SI-NEXT: s_brev_b32 s2, -2
+; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-NEXT: v_bfi_b32 v2, s2, v2, v3
+; SI-NEXT: v_add_f64 v[2:3], v[4:5], v[1:2]
 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
 ; SI-NEXT: s_endpgm
@@ -113,16 +112,15 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
 ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
-; CI-NEXT: v_mov_b32_e32 v8, 0x3ff00000
 ; CI-NEXT: s_brev_b32 s2, -2
 ; CI-NEXT: s_waitcnt vmcnt(0)
 ; CI-NEXT: v_trunc_f64_e32 v[4:5], v[2:3]
 ; CI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5]
+; CI-NEXT: v_mov_b32_e32 v2, 0x3ff00000
 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
-; CI-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
-; CI-NEXT: v_bfi_b32 v3, s2, v2, v3
-; CI-NEXT: v_mov_b32_e32 v2, v1
-; CI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; CI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; CI-NEXT: v_bfi_b32 v2, s2, v2, v3
+; CI-NEXT: v_add_f64 v[2:3], v[4:5], v[1:2]
 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
 ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
 ; CI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index 6f63384be90fd..2d60c5729ed52 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -9775,17 +9775,17 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0
 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT: s_and_b32 s5, s4, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s6, s4, 24
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s6, v0, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s5, s5, 8
-; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s5
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NOHSA-NEXT: s_lshr_b32 s5, s4, 24
+; GFX6-NOHSA-NEXT: s_and_b32 s2, s4, 0xff00
+; GFX6-NOHSA-NEXT: s_and_b32 s6, s4, 0xff
+; GFX6-NOHSA-NEXT: s_lshl_b32 s2, s2, 8
+; GFX6-NOHSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
+; GFX6-NOHSA-NEXT: s_or_b32 s5, s6, s2
+; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff00ff
+; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4
 ; GFX6-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NOHSA-NEXT: s_endpgm
 ;
@@ -9800,15 +9800,15 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1
 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-HSA-NEXT: s_lshr_b32 s3, s2, 24
 ; GFX7-HSA-NEXT: s_and_b32 s0, s2, 0xff00
-; GFX7-HSA-NEXT: s_lshr_b32 s1, s2, 24
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-HSA-NEXT: s_and_b32 s2, s2, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s0, s0, 8
-; GFX7-HSA-NEXT: v_alignbit_b32 v2, s1, v2, 16
-; GFX7-HSA-NEXT: s_or_b32 s0, s2, s0
-; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v2
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-HSA-NEXT: s_and_b32 s4, s2, 0xff
+; GFX7-HSA-NEXT: s_lshl_b32 s5, s0, 8
+; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[2:3], 16
+; GFX7-HSA-NEXT: s_or_b32 s1, s4, s5
+; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff00ff
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0
 ; GFX7-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX7-HSA-NEXT: s_endpgm
 ;
@@ -9820,15 +9820,15 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NOHSA-NEXT: s_and_b32 s1, s2, 0xff
-; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 8
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v2, s0, v2, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s0, s2, 0xff0000
-; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0
-; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s2, 24
+; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s2, 8
+; GFX8-NOHSA-NEXT: s_and_b32 s4, s2, 0xff
+; GFX8-NOHSA-NEXT: s_and_b32 s5, s0, 0xff0000
+; GFX8-NOHSA-NEXT: s_lshr_b64 s[0:1], s[2:3], 16
+; GFX8-NOHSA-NEXT: s_or_b32 s1, s4, s5
+; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xff00ff
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
 ; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NOHSA-NEXT: s_endpgm
 ;
@@ -10062,26 +10062,28 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT: s_and_b32 s6, s4, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s7, s4, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s8, s5, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s9, s5, 24
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s5
-; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4
-; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s9, v0, 16
+; GFX6-NOHSA-NEXT: s_lshr_b32 s2, s4, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s7, s5, 24
+; GFX6-NOHSA-NEXT: s_and_b32 s8, s4, 0xff00
+; GFX6-NOHSA-NEXT: s_and_b32 s9, s5, 0xff00
+; GFX6-NOHSA-NEXT: s_and_b32 s10, s5, 0xff
+; GFX6-NOHSA-NEXT: s_and_b32 s11, s4, 0xff
+; GFX6-NOHSA-NEXT: s_mov_b32 s6, s5
+; GFX6-NOHSA-NEXT: s_lshl_b32 s9, s9, 8
 ; GFX6-NOHSA-NEXT: s_lshl_b32 s8, s8, 8
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v1, s7, v1, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s6, s6, 8
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
-; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s8
-; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s6
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NOHSA-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
+; GFX6-NOHSA-NEXT: s_mov_b32 s5, s2
+; GFX6-NOHSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
+; GFX6-NOHSA-NEXT: s_or_b32 s5, s10, s9
+; GFX6-NOHSA-NEXT: s_or_b32 s7, s11, s8
+; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff00ff
+; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff00ff
+; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s7
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4
 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s6
 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NOHSA-NEXT: s_endpgm
 ;
@@ -10096,24 +10098,26 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-HSA-NEXT: s_lshr_b32 s5, s3, 24
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s5, v0, 16
 ; GFX7-HSA-NEXT: s_and_b32 s0, s2, 0xff00
-; GFX7-HSA-NEXT: s_lshr_b32 s1, s2, 24
-; GFX7-HSA-NEXT: s_and_b32 s4, s3, 0xff00
-; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-HSA-NEXT: s_and_b32 s3, s3, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s4, s4, 8
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s1, v0, 16
-; GFX7-HSA-NEXT: s_and_b32 s1, s2, 0xff
+; GFX7-HSA-NEXT: s_and_b32 s5, s3, 0xff00
+; GFX7-HSA-NEXT: s_and_b32 s6, s3, 0xff
+; GFX7-HSA-NEXT: s_and_b32 s7, s2, 0xff
+; GFX7-HSA-NEXT: s_lshl_b32 s5, s5, 8
 ; GFX7-HSA-NEXT: s_lshl_b32 s0, s0, 8
-; GFX7-HSA-NEXT: s_or_b32 s3, s3, s4
-; GFX7-HSA-NEXT: s_or_b32 s0, s1, s0
-; GFX7-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3
+; GFX7-HSA-NEXT: s_lshr_b32 s4, s2, 24
+; GFX7-HSA-NEXT: s_lshr_b32 s1, s3, 24
+; GFX7-HSA-NEXT: s_or_b32 s5, s6, s5
+; GFX7-HSA-NEXT: s_or_b32 s6, s7, s0
+; GFX7-HSA-NEXT: s_mov_b32 s0, s3
+; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
+; GFX7-HSA-NEXT: s_mov_b32 s3, s4
+; GFX7-HSA-NEXT: s_and_b32 s7, s0, 0xff00ff
+; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[2:3], 16
+; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff00ff
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7
 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX7-HSA-NEXT: s_endpgm
 ;
@@ -10122,28 +10126,29 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
 ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
 ; GFX8-NOHSA-NEXT: s_lshr_b32 s1, s3, 24
 ; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s3, 0x80010
 ; GFX8-NOHSA-NEXT: s_and_b32 s5, s3, 0xff
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 8
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s0, v0, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s0, s2, 0xff
-; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 8
+; GFX8-NOHSA-NEXT: s_and_b32 s3, s3, 0xff0000
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 16
+; GFX8-NOHSA-NEXT: s_or_b32 s5, s5, s3
+; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s2, 8
+; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24
+; GFX8-NOHSA-NEXT: s_or_b32 s4, s4, s1
+; GFX8-NOHSA-NEXT: s_and_b32 s1, s2, 0xff
 ; GFX8-NOHSA-NEXT: s_and_b32 s3, s3, 0xff0000
-; GFX8-NOHSA-NEXT: s_and_b32 s2, s2, 0xff0000
-; GFX8-NOHSA-NEXT: s_or_b32 s1, s4, s1
-; GFX8-NOHSA-NEXT: s_or_b32 s3, s5, s3
-; GFX8-NOHSA-NEXT: s_or_b32 s0, s0, s2
-; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NOHSA-NEXT: s_or_b32 s6, s1, s3
+; GFX8-NOHSA-NEXT: s_mov_b32 s3, s0
+; GFX8-NOHSA-NEXT: s_lshr_b64 s[0:1], s[2:3], 16
+; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xff00ff
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s4
 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT: s_endpgm
 ;
@@ -10500,43 +10505,48 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000
 ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT: s_and_b32 s8, s6, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s9, s6, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s10, s7, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s11, s7, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s12, s4, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s13, s4, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s14, s5, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s5, 24
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s5
-; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff
+; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s6, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s9, s7, 24
+; GFX6-NOHSA-NEXT: s_and_b32 s13, s6, 0xff00
+; GFX6-NOHSA-NEXT: s_and_b32 s14, s7, 0xff00
+; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s4, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s11, s5, 24
+; GFX6-NOHSA-NEXT: s_and_b32 s16, s4, 0xff00
+; GFX6-NOHSA-NEXT: s_and_b32 s17, s5, 0xff00
+; GFX6-NOHSA-NEXT: s_and_b32 s18, s5, 0xff
+; GFX6-NOHSA-NEXT: s_and_b32 s19, s4, 0xff
+; GFX6-NOHSA-NEXT: s_mov_b32 s10, s5
+; GFX6-NOHSA-NEXT: s_and_b32 s20, s7, 0xff
+; GFX6-NOHSA-NEXT: s_and_b32 s21, s6, 0xff
+; GFX6-NOHSA-NEXT: s_mov_b32 s8, s7
+; GFX6-NOHSA-NEXT: s_lshl_b32 s17, s17, 8
+; GFX6-NOHSA-NEXT: s_lshl_b32 s16, s16, 8
+; GFX6-NOHSA-NEXT: s_lshr_b64 s[10:11], s[10:11], 16
+; GFX6-NOHSA-NEXT: s_mov_b32 s5, s15
+; GFX6-NOHSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
+; GFX6-NOHSA-NEXT: s_lshl_b32 s5, s14, 8
+; GFX6-NOHSA-NEXT: s_lshl_b32 s11, s13, 8
+; GFX6-NOHSA-NEXT: s_lshr_b64 s[8:9], s[8:9], 16
+; GFX6-NOHSA-NEXT: s_mov_b32 s7, s12
+; GFX6-NOHSA-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
+; GFX6-NOHSA-NEXT: s_or_b32 s7, s18, s17
+; GFX6-NOHSA-NEXT: s_or_b32 s9, s19, s16
+; GFX6-NOHSA-NEXT: s_and_b32 s10, s10, 0xff00ff
+; GFX6-NOHSA-NEXT: s_or_b32 s5, s20, s5
+; GFX6-NOHSA-NEXT: s_or_b32 s11, s21, s11
+; GFX6-NOHSA-NEXT: s_and_b32 s8, s8, 0xff00ff
+; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff00ff
+; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff00ff
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s11
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s6
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s8
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s9
 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4
-; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff
 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s7
-; GFX6-NOHSA-NEXT: s_and_b32 s7, s7, 0xff
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s6
-; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s15, v0, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s14, s14, 8
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v1, s13, v1, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s12, s12, 8
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v2, s11, v2, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s10, s10, 8
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v4, s9, v3, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s8, s8, 8
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
-; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s14
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1
-; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s12
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v2
-; GFX6-NOHSA-NEXT: s_or_b32 s7, s7, s10
-; GFX6-NOHSA-NEXT: s_or_b32 s6, s6, s8
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v4
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s6
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s7
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s10
 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NOHSA-NEXT: s_endpgm
 ;
@@ -10549,48 +10559,52 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-HSA-NEXT: s_lshr_b32 s13, s5, 24
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s13, v0, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s11, s4, 24
-; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s11, v0, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s9, s7, 24
-; GFX7-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s9, v0, 16
-; GFX7-HSA-NEXT: s_and_b32 s2, s6, 0xff00
-; GFX7-HSA-NEXT: s_lshr_b32 s3, s6, 24
-; GFX7-HSA-NEXT: s_and_b32 s8, s7, 0xff00
-; GFX7-HSA-NEXT: s_and_b32 s10, s4, 0xff00
-; GFX7-HSA-NEXT: s_and_b32 s12, s5, 0xff00
-; GFX7-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-HSA-NEXT: s_and_b32 s5, s5, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s12, s12, 8
-; GFX7-HSA-NEXT: s_and_b32 s4, s4, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s10, s10, 8
-; GFX7-HSA-NEXT: s_and_b32 s7, s7, 0xff
+; GFX7-HSA-NEXT: s_and_b32 s13, s5, 0xff00
+; GFX7-HSA-NEXT: s_and_b32 s8, s4, 0xff00
+; GFX7-HSA-NEXT: s_and_b32 s14, s5, 0xff
+; GFX7-HSA-NEXT: s_lshl_b32 s13, s13, 8
+; GFX7-HSA-NEXT: s_lshr_b32 s12, s4, 24
+; GFX7-HSA-NEXT: s_or_b32 s13, s14, s13
+; GFX7-HSA-NEXT: s_and_b32 s14, s4, 0xff
 ; GFX7-HSA-NEXT: s_lshl_b32 s8, s8, 8
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s3, v0, 16
-; GFX7-HSA-NEXT: s_and_b32 s3, s6, 0xff
+; GFX7-HSA-NEXT: s_lshr_b32 s9, s5, 24
+; GFX7-HSA-NEXT: s_or_b32 s14, s14, s8
+; GFX7-HSA-NEXT: s_mov_b32 s8, s5
+; GFX7-HSA-NEXT: s_mov_b32 s5, s12
+; GFX7-HSA-NEXT: s_and_b32 s11, s7, 0xff00
+; GFX7-HSA-NEXT: s_lshr_b64 s[8:9], s[8:9], 16
+; GFX7-HSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
+; GFX7-HSA-NEXT: s_and_b32 s2, s6, 0xff00
+; GFX7-HSA-NEXT: s_and_b32 s5, s7, 0xff
+; GFX7-HSA-NEXT: s_lshl_b32 s9, s11, 8
+; GFX7-HSA-NEXT: s_or_b32 s5, s5, s9
+; GFX7-HSA-NEXT: s_and_b32 s9, s6, 0xff
 ; GFX7-HSA-NEXT: s_lshl_b32 s2, s2, 8
-; GFX7-HSA-NEXT: s_or_b32 s5, s5, s12
-; GFX7-HSA-NEXT: s_or_b32 s4, s4, s10
-; GFX7-HSA-NEXT: s_or_b32 s7, s7, s8
-; GFX7-HSA-NEXT: s_or_b32 s2, s3, s2
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
+; GFX7-HSA-NEXT: s_lshr_b32 s10, s6, 24
+; GFX7-HSA-NEXT: s_lshr_b32 s3, s7, 24
+; GFX7-HSA-NEXT: s_or_b32 s9, s9, s2
+; GFX7-HSA-NEXT: s_mov_b32 s2, s7
+; GFX7-HSA-NEXT: s_lshr_b64 s[2:3], s[2:3], 16
+; GFX7-HSA-NEXT: s_mov_b32 s7, s10
+; GFX7-HSA-NEXT: s_and_b32 s11, s2, 0xff00ff
+; GFX7-HSA-NEXT: s_lshr_b64 s[2:3], s[6:7], 16
+; GFX7-HSA-NEXT: s_and_b32 s8, s8, 0xff00ff
+; GFX7-HSA-NEXT: s_and_b32 s4, s4, 0xff00ff
+; GFX7-HSA-NEXT: s_and_b32 s2, s2, 0xff00ff
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s2
 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16
 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s3
-; GFX7-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s7
-; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s2
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s9
 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s13
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s8
 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX7-HSA-NEXT: s_endpgm
@@ -10601,50 +10615,52 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s4, 24
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s3, v0, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s3, s4, 0xff
-; GFX8-NOHSA-NEXT: s_lshl_b32 s4, s4, 8
-; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s5, 24
-; GFX8-NOHSA-NEXT: s_and_b32 s4, s4, 0xff0000
+; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s5, 24
 ; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s5, 0x80010
-; GFX8-NOHSA-NEXT: s_lshl_b32 s8, s8, 16
-; GFX8-NOHSA-NEXT: s_or_b32 s4, s3, s4
-; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s7, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s6, 24
-; GFX8-NOHSA-NEXT: s_or_b32 s8, s9, s8
-; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 16
-; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s7, 0x80010
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
 ; GFX8-NOHSA-NEXT: s_and_b32 s10, s5, 0xff
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s5, s5, 8
-; GFX8-NOHSA-NEXT: s_or_b32 s3, s9, s3
-; GFX8-NOHSA-NEXT: s_and_b32 s9, s7, 0xff
-; GFX8-NOHSA-NEXT: s_lshl_b32 s7, s7, 8
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s2, v0, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s2, s6, 0xff
-; GFX8-NOHSA-NEXT: s_lshl_b32 s6, s6, 8
+; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s4, 24
 ; GFX8-NOHSA-NEXT: s_and_b32 s5, s5, 0xff0000
-; GFX8-NOHSA-NEXT: s_and_b32 s7, s7, 0xff0000
-; GFX8-NOHSA-NEXT: s_and_b32 s6, s6, 0xff0000
-; GFX8-NOHSA-NEXT: s_or_b32 s5, s10, s5
-; GFX8-NOHSA-NEXT: s_or_b32 s7, s9, s7
-; GFX8-NOHSA-NEXT: s_or_b32 s2, s2, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NOHSA-NEXT: s_or_b32 s9, s9, s3
+; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s4, 8
+; GFX8-NOHSA-NEXT: s_and_b32 s11, s4, 0xff
+; GFX8-NOHSA-NEXT: s_or_b32 s10, s10, s5
+; GFX8-NOHSA-NEXT: s_and_b32 s3, s3, 0xff0000
+; GFX8-NOHSA-NEXT: s_mov_b32 s5, s2
+; GFX8-NOHSA-NEXT: s_or_b32 s11, s11, s3
+; GFX8-NOHSA-NEXT: s_lshr_b64 s[2:3], s[4:5], 16
+; GFX8-NOHSA-NEXT: s_and_b32 s4, s2, 0xff00ff
+; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s7, 24
+; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 16
+; GFX8-NOHSA-NEXT: s_bfe_u32 s3, s7, 0x80010
+; GFX8-NOHSA-NEXT: s_or_b32 s5, s3, s2
+; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s7, 8
+; GFX8-NOHSA-NEXT: s_and_b32 s2, s7, 0xff
+; GFX8-NOHSA-NEXT: s_and_b32 s3, s3, 0xff0000
+; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s6, 24
+; GFX8-NOHSA-NEXT: s_or_b32 s12, s2, s3
+; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s6, 8
+; GFX8-NOHSA-NEXT: s_and_b32 s2, s6, 0xff
+; GFX8-NOHSA-NEXT: s_and_b32 s3, s3, 0xff0000
+; GFX8-NOHSA-NEXT: s_mov_b32 s7, s8
+; GFX8-NOHSA-NEXT: s_or_b32 s13, s2, s3
+; GFX8-NOHSA-NEXT: s_lshr_b64 s[2:3], s[6:7], 16
+; GFX8-NOHSA-NEXT: s_and_b32 s2, s2, 0xff00ff
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s2
 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s3
-; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s8
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s11
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s9
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT: s_endpgm
@@ -11272,81 +11288,92 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
 ; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000
 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, -1
 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NOHSA-NEXT: s_and_b32 s12, s6, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s13, s6, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s14, s7, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s7, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s16, s4, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s17, s4, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s18, s5, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s19, s5, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s20, s2, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s21, s2, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s22, s3, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s23, s3, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s24, s0, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s25, s0, 24
-; GFX6-NOHSA-NEXT: s_and_b32 s26, s1, 0xff00
-; GFX6-NOHSA-NEXT: s_lshr_b32 s27, s1, 24
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s1
-; GFX6-NOHSA-NEXT: s_and_b32 s1, s1, 0xff
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s0
-; GFX6-NOHSA-NEXT: s_and_b32 s0, s0, 0xff
+; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s6, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s13, s7, 24
+; GFX6-NOHSA-NEXT: s_and_b32 s21, s6, 0xff00
+; GFX6-NOHSA-NEXT: s_and_b32 s22, s7, 0xff00
+; GFX6-NOHSA-NEXT: s_lshr_b32 s23, s4, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s5, 24
+; GFX6-NOHSA-NEXT: s_and_b32 s24, s4, 0xff00
+; GFX6-NOHSA-NEXT: s_and_b32 s25, s5, 0xff00
+; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s2, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s17, s3, 24
+; GFX6-NOHSA-NEXT: s_and_b32 s27, s2, 0xff00
+; GFX6-NOHSA-NEXT: s_and_b32 s28, s3, 0xff00
+; GFX6-NOHSA-NEXT: s_lshr_b32 s29, s0, 24
+; GFX6-NOHSA-NEXT: s_lshr_b32 s19, s1, 24
+; GFX6-NOHSA-NEXT: s_and_b32 s30, s0, 0xff00
+; GFX6-NOHSA-NEXT: s_and_b32 s31, s1, 0xff00
+; GFX6-NOHSA-NEXT: s_and_b32 s33, s1, 0xff
+; GFX6-NOHSA-NEXT: s_and_b32 s34, s0, 0xff
+; GFX6-NOHSA-NEXT: s_mov_b32 s18, s1
+; GFX6-NOHSA-NEXT: s_and_b32 s35, s3, 0xff
+; GFX6-NOHSA-NEXT: s_and_b32 s36, s2, 0xff
+; GFX6-NOHSA-NEXT: s_mov_b32 s16, s3
+; GFX6-NOHSA-NEXT: s_and_b32 s37, s5, 0xff
+; GFX6-NOHSA-NEXT: s_and_b32 s38, s4, 0xff
+; GFX6-NOHSA-NEXT: s_mov_b32 s14, s5
+; GFX6-NOHSA-NEXT: s_and_b32 s39, s7, 0xff
+; GFX6-NOHSA-NEXT: s_and_b32 s40, s6, 0xff
+; GFX6-NOHSA-NEXT: s_mov_b32 s12, s7
+; GFX6-NOHSA-NEXT: s_lshl_b32 s31, s31, 8
+; GFX6-NOHSA-NEXT: s_lshl_b32 s30, s30, 8
+; GFX6-NOHSA-NEXT: s_lshr_b64 s[18:19], s[18:19], 16
+; GFX6-NOHSA-NEXT: s_mov_b32 s1, s29
+; GFX6-NOHSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
+; GFX6-NOHSA-NEXT: s_lshl_b32 s1, s28, 8
+; GFX6-NOHSA-NEXT: s_lshl_b32 s19, s27, 8
+; GFX6-NOHSA-NEXT: s_lshr_b64 s[16:17], s[16:17], 16
+; GFX6-NOHSA-NEXT: s_mov_b32 s3, s26
+; GFX6-NOHSA-NEXT: s_lshr_b64 s[2:3], s[2:3], 16
+; GFX6-NOHSA-NEXT: s_lshl_b32 s3, s25, 8
+; GFX6-NOHSA-NEXT: s_lshl_b32 s17, s24, 8
+; GFX6-NOHSA-NEXT: s_lshr_b64 s[14:15], s[14:15], 16
+; GFX6-NOHSA-NEXT: s_mov_b32 s5, s23
+; GFX6-NOHSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
+; GFX6-NOHSA-NEXT: s_lshl_b32 s5, s22, 8
+; GFX6-NOHSA-NEXT: s_lshl_b32 s15, s21, 8
+; GFX6-NOHSA-NEXT: s_lshr_b64 s[12:13], s[12:13], 16
+; GFX6-NOHSA-NEXT: s_mov_b32 s7, s20
+; GFX6-NOHSA-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
+; GFX6-NOHSA-NEXT: s_or_b32 s7, s33, s31
+; GFX6-NOHSA-NEXT: s_or_b32 s13, s34, s30
+; GFX6-NOHSA-NEXT: s_and_b32 s18, s18, 0xff00ff
+; GFX6-NOHSA-NEXT: s_and_b32 s0, s0, 0xff00ff
+; GFX6-NOHSA-NEXT: s_or_b32 s1, s35, s1
+; GFX6-NOHSA-NEXT: s_or_b32 s19, s36, s19
+; GFX6-NOHSA-NEXT: s_and_b32 s16, s16, 0xff00ff
+; GFX6-NOHSA-NEXT: s_and_b32 s2, s2, 0xff00ff
+; GFX6-NOHSA-NEXT: s_or_b32 s3, s37, s3
+; GFX6-NOHSA-NEXT: s_or_b32 s17, s38, s17
+; GFX6-NOHSA-NEXT: s_and_b32 s14, s14, 0xff00ff
+; GFX6-NOHSA-NEXT: s_or_b32 s5, s39, s5
+; GFX6-NOHSA-NEXT: s_or_b32 s15, s40, s15
+; GFX6-NOHSA-NEXT: s_and_b32 s12, s12, 0xff00ff
+; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff00ff
+; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff00ff
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s15
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s6
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s12
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48
+; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s17
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4
 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s3
-; GFX6-NOHSA-NEXT: s_and_b32 s3, s3, 0xff
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s2
-; GFX6-NOHSA-NEXT: s_and_b32 s2, s2, 0xff
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s5
-; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s4
-; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s7
-; GFX6-NOHSA-NEXT: s_and_b32 s7, s7, 0xff
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s6
-; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s27, v0, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s26, s26, 8
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v1, s25, v1, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s24, s24, 8
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v2, s23, v2, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s22, s22, 8
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v8, s21, v3, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s20, s20, 8
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v4, s19, v4, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s18, s18, 8
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v9, s17, v5, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s16, s16, 8
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v6, s15, v6, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s14, s14, 8
-; GFX6-NOHSA-NEXT: v_alignbit_b32 v10, s13, v7, 16
-; GFX6-NOHSA-NEXT: s_lshl_b32 s12, s12, 8
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
-; GFX6-NOHSA-NEXT: s_or_b32 s1, s1, s26
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1
-; GFX6-NOHSA-NEXT: s_or_b32 s0, s0, s24
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v2
-; GFX6-NOHSA-NEXT: s_or_b32 s3, s3, s22
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v8
-; GFX6-NOHSA-NEXT: s_or_b32 s2, s2, s20
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v11, 0xff00ff, v4
-; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s18
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v9, 0xff00ff, v9
-; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s16
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v15, 0xff00ff, v6
-; GFX6-NOHSA-NEXT: s_or_b32 s7, s7, s14
-; GFX6-NOHSA-NEXT: s_or_b32 s6, s6, s12
-; GFX6-NOHSA-NEXT: v_and_b32_e32 v13, 0xff00ff, v10
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s6
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s7
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[8:11], 0 offset:48
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s4
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s5
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:32
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s3
-; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s14
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:32
+; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s19
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s2
 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s1
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s16
+; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16
+; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s13
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s0
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s7
+; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s18
 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; GFX6-NOHSA-NEXT: s_endpgm
 ;
@@ -11354,99 +11381,106 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
 ; GFX7-HSA: ; %bb.0:
 ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
 ; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
-; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
 ; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-HSA-NEXT: s_lshr_b32 s25, s1, 24
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s1
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s25, v0, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s23, s0, 24
-; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s23, v0, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s21, s3, 24
-; GFX7-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s21, v0, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s19, s2, 24
-; GFX7-HSA-NEXT: s_and_b32 s24, s1, 0xff00
-; GFX7-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-HSA-NEXT: s_and_b32 s22, s0, 0xff00
-; GFX7-HSA-NEXT: s_and_b32 s1, s1, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s24, s24, 8
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s19, v0, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s17, s5, 24
+; GFX7-HSA-NEXT: s_and_b32 s22, s1, 0xff00
+; GFX7-HSA-NEXT: s_and_b32 s12, s0, 0xff00
+; GFX7-HSA-NEXT: s_and_b32 s23, s1, 0xff
+; GFX7-HSA-NEXT: s_lshl_b32 s22, s22, 8
+; GFX7-HSA-NEXT: s_lshr_b32 s21, s0, 24
+; GFX7-HSA-NEXT: s_or_b32 s22, s23, s22
+; GFX7-HSA-NEXT: s_and_b32 s23, s0, 0xff
+; GFX7-HSA-NEXT: s_lshl_b32 s12, s12, 8
+; GFX7-HSA-NEXT: s_lshr_b32 s13, s1, 24
+; GFX7-HSA-NEXT: s_or_b32 s23, s23, s12
+; GFX7-HSA-NEXT: s_mov_b32 s12, s1
+; GFX7-HSA-NEXT: s_mov_b32 s1, s21
 ; GFX7-HSA-NEXT: s_and_b32 s20, s3, 0xff00
-; GFX7-HSA-NEXT: s_or_b32 s24, s1, s24
-; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s1, s22, 8
-; GFX7-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5
-; GFX7-HSA-NEXT: s_and_b32 s18, s2, 0xff00
-; GFX7-HSA-NEXT: s_or_b32 s22, s0, s1
+; GFX7-HSA-NEXT: s_lshr_b64 s[12:13], s[12:13], 16
+; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
+; GFX7-HSA-NEXT: s_and_b32 s19, s2, 0xff00
+; GFX7-HSA-NEXT: s_and_b32 s13, s0, 0xff00ff
 ; GFX7-HSA-NEXT: s_and_b32 s0, s3, 0xff
 ; GFX7-HSA-NEXT: s_lshl_b32 s1, s20, 8
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s17, v0, 16
-; GFX7-HSA-NEXT: s_lshr_b32 s15, s4, 24
-; GFX7-HSA-NEXT: s_and_b32 s16, s5, 0xff00
-; GFX7-HSA-NEXT: s_or_b32 s3, s0, s1
+; GFX7-HSA-NEXT: s_or_b32 s20, s0, s1
 ; GFX7-HSA-NEXT: s_and_b32 s0, s2, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s1, s18, 8
-; GFX7-HSA-NEXT: v_and_b32_e32 v11, 0xff00ff, v0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-HSA-NEXT: s_and_b32 s14, s4, 0xff00
-; GFX7-HSA-NEXT: s_or_b32 s2, s0, s1
+; GFX7-HSA-NEXT: s_lshl_b32 s1, s19, 8
+; GFX7-HSA-NEXT: s_lshr_b32 s18, s2, 24
+; GFX7-HSA-NEXT: s_or_b32 s19, s0, s1
+; GFX7-HSA-NEXT: s_lshr_b32 s1, s3, 24
+; GFX7-HSA-NEXT: s_mov_b32 s0, s3
+; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
+; GFX7-HSA-NEXT: s_mov_b32 s3, s18
+; GFX7-HSA-NEXT: s_and_b32 s17, s5, 0xff00
+; GFX7-HSA-NEXT: s_and_b32 s21, s0, 0xff00ff
+; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[2:3], 16
+; GFX7-HSA-NEXT: s_and_b32 s10, s4, 0xff00
+; GFX7-HSA-NEXT: s_and_b32 s2, s0, 0xff00ff
 ; GFX7-HSA-NEXT: s_and_b32 s0, s5, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s1, s16, 8
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s15, v0, 16
-; GFX7-HSA-NEXT: s_and_b32 s12, s7, 0xff00
-; GFX7-HSA-NEXT: s_lshr_b32 s13, s7, 24
-; GFX7-HSA-NEXT: s_or_b32 s5, s0, s1
-; GFX7-HSA-NEXT: v_and_b32_e32 v9, 0xff00ff, v0
+; GFX7-HSA-NEXT: s_lshl_b32 s1, s17, 8
+; GFX7-HSA-NEXT: s_lshr_b32 s16, s4, 24
+; GFX7-HSA-NEXT: s_lshr_b32 s11, s5, 24
+; GFX7-HSA-NEXT: s_or_b32 s3, s0, s1
 ; GFX7-HSA-NEXT: s_and_b32 s0, s4, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s1, s14, 8
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7
-; GFX7-HSA-NEXT: s_and_b32 s10, s6, 0xff00
-; GFX7-HSA-NEXT: s_or_b32 s4, s0, s1
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s13, v0, 16
+; GFX7-HSA-NEXT: s_lshl_b32 s1, s10, 8
+; GFX7-HSA-NEXT: s_mov_b32 s10, s5
+; GFX7-HSA-NEXT: s_or_b32 s17, s0, s1
+; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[10:11], 16
+; GFX7-HSA-NEXT: s_mov_b32 s5, s16
+; GFX7-HSA-NEXT: s_and_b32 s15, s7, 0xff00
+; GFX7-HSA-NEXT: s_and_b32 s10, s0, 0xff00ff
+; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[4:5], 16
+; GFX7-HSA-NEXT: s_and_b32 s14, s6, 0xff00
+; GFX7-HSA-NEXT: s_and_b32 s4, s0, 0xff00ff
 ; GFX7-HSA-NEXT: s_and_b32 s0, s7, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s1, s12, 8
-; GFX7-HSA-NEXT: s_lshr_b32 s11, s6, 24
-; GFX7-HSA-NEXT: v_and_b32_e32 v15, 0xff00ff, v0
-; GFX7-HSA-NEXT: s_or_b32 s0, s0, s1
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-HSA-NEXT: s_and_b32 s1, s6, 0xff
-; GFX7-HSA-NEXT: s_lshl_b32 s6, s10, 8
-; GFX7-HSA-NEXT: s_or_b32 s1, s1, s6
-; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s0
+; GFX7-HSA-NEXT: s_lshl_b32 s1, s15, 8
+; GFX7-HSA-NEXT: s_or_b32 s5, s0, s1
+; GFX7-HSA-NEXT: s_and_b32 s0, s6, 0xff
+; GFX7-HSA-NEXT: s_lshl_b32 s1, s14, 8
+; GFX7-HSA-NEXT: s_or_b32 s11, s0, s1
+; GFX7-HSA-NEXT: s_lshr_b32 s1, s7, 24
+; GFX7-HSA-NEXT: s_mov_b32 s0, s7
+; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
+; GFX7-HSA-NEXT: s_lshr_b32 s7, s6, 24
+; GFX7-HSA-NEXT: s_and_b32 s14, s0, 0xff00ff
+; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[6:7], 16
+; GFX7-HSA-NEXT: s_and_b32 s12, s12, 0xff00ff
+; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff00ff
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s0
 ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 48
-; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s1
 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s1
-; GFX7-HSA-NEXT: v_alignbit_b32 v0, s11, v0, 16
-; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
 ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 32
-; GFX7-HSA-NEXT: v_and_b32_e32 v13, 0xff00ff, v0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s11
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s14
 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s4
-; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s1
-; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
 ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 16
-; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s5
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s17
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s10
 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s1
-; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s3
-; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s0
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s19
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s2
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s23
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12
 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9
 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX7-HSA-NEXT: s_endpgm
@@ -11463,90 +11497,94 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 8
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s14, s14, 16
 ; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
-; GFX8-NOHSA-NEXT: s_lshr_b32 s13, s0, 24
 ; GFX8-NOHSA-NEXT: s_or_b32 s14, s15, s14
 ; GFX8-NOHSA-NEXT: s_or_b32 s15, s16, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: s_lshl_b32 s16, s0, 8
+; GFX8-NOHSA-NEXT: s_lshr_b32 s13, s0, 24
 ; GFX8-NOHSA-NEXT: s_and_b32 s1, s0, 0xff
-; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 8
-; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xff0000
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s13, v0, 16
-; GFX8-NOHSA-NEXT: s_or_b32 s13, s1, s0
+; GFX8-NOHSA-NEXT: s_and_b32 s16, s16, 0xff0000
+; GFX8-NOHSA-NEXT: s_or_b32 s16, s1, s16
+; GFX8-NOHSA-NEXT: s_mov_b32 s1, s13
+; GFX8-NOHSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
+; GFX8-NOHSA-NEXT: s_and_b32 s13, s0, 0xff00ff
 ; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s3, 24
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16
 ; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s3, 0x80010
-; GFX8-NOHSA-NEXT: s_or_b32 s16, s1, s0
+; GFX8-NOHSA-NEXT: s_or_b32 s17, s1, s0
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s3, 8
 ; GFX8-NOHSA-NEXT: s_and_b32 s0, s3, 0xff
 ; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
-; GFX8-NOHSA-NEXT: s_or_b32 s3, s0, s1
+; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s2, 24
+; GFX8-NOHSA-NEXT: s_or_b32 s18, s0, s1
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s2, 8
 ; GFX8-NOHSA-NEXT: s_and_b32 s0, s2, 0xff
 ; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
-; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s2, 24
-; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: s_or_b32 s2, s0, s1
+; GFX8-NOHSA-NEXT: s_mov_b32 s3, s12
+; GFX8-NOHSA-NEXT: s_or_b32 s19, s0, s1
+; GFX8-NOHSA-NEXT: s_lshr_b64 s[0:1], s[2:3], 16
+; GFX8-NOHSA-NEXT: s_and_b32 s2, s0, 0xff00ff
 ; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s5, 24
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16
 ; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s5, 0x80010
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s12, v0, 16
-; GFX8-NOHSA-NEXT: s_or_b32 s12, s1, s0
+; GFX8-NOHSA-NEXT: s_or_b32 s3, s1, s0
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s5, 8
 ; GFX8-NOHSA-NEXT: s_and_b32 s0, s5, 0xff
 ; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
-; GFX8-NOHSA-NEXT: s_or_b32 s5, s0, s1
+; GFX8-NOHSA-NEXT: s_lshr_b32 s11, s4, 24
+; GFX8-NOHSA-NEXT: s_or_b32 s12, s0, s1
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s4, 8
 ; GFX8-NOHSA-NEXT: s_and_b32 s0, s4, 0xff
 ; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
-; GFX8-NOHSA-NEXT: s_lshr_b32 s11, s4, 24
-; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: s_or_b32 s4, s0, s1
+; GFX8-NOHSA-NEXT: s_mov_b32 s5, s11
+; GFX8-NOHSA-NEXT: s_or_b32 s20, s0, s1
+; GFX8-NOHSA-NEXT: s_lshr_b64 s[0:1], s[4:5], 16
+; GFX8-NOHSA-NEXT: s_and_b32 s4, s0, 0xff00ff
 ; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s7, 24
 ; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16
 ; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s7, 0x80010
-; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0
-; GFX8-NOHSA-NEXT: s_and_b32 s1, s7, 0xff
-; GFX8-NOHSA-NEXT: s_lshl_b32 s7, s7, 8
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s11, v0, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s7, s7, 0xff0000
+; GFX8-NOHSA-NEXT: s_or_b32 s5, s1, s0
+; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s7, 8
+; GFX8-NOHSA-NEXT: s_and_b32 s0, s7, 0xff
+; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
 ; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s6, 24
-; GFX8-NOHSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v0
-; GFX8-NOHSA-NEXT: s_or_b32 s1, s1, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NOHSA-NEXT: s_and_b32 s7, s6, 0xff
-; GFX8-NOHSA-NEXT: s_lshl_b32 s6, s6, 8
-; GFX8-NOHSA-NEXT: s_and_b32 s6, s6, 0xff0000
-; GFX8-NOHSA-NEXT: s_or_b32 s6, s7, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s0
+; GFX8-NOHSA-NEXT: s_or_b32 s11, s0, s1
+; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s6, 8
+; GFX8-NOHSA-NEXT: s_and_b32 s0, s6, 0xff
+; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
+; GFX8-NOHSA-NEXT: s_mov_b32 s7, s10
+; GFX8-NOHSA-NEXT: s_or_b32 s21, s0, s1
+; GFX8-NOHSA-NEXT: s_lshr_b64 s[0:1], s[6:7], 16
+; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xff00ff
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s0
 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 48
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s1
 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s1
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s10, v0, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 32
-; GFX8-NOHSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s21
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5
 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[6:9]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s12
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s20
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3
 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s19
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s18
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s17
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s15
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s14
 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 59f4a9d44bbdd..d23c49165ec70 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -5985,14 +5985,13 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, v1
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 16
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 31, v1
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v6, 16, v1
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v2, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v1, 0, 16
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v3, 0, 16
+; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v2, 0, 16
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
@@ -6011,23 +6010,22 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
 ; GCN-HSA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
+; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16
 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GCN-HSA-NEXT: v_bfe_i32 v4, v3, 0, 16
-; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16
 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1
 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v1
-; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 16
 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT: s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_sextload_v4i16_to_v4i64:
@@ -6044,11 +6042,10 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, v2
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v4, 0, 16
-; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v5, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v2, 0, 16
+; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v6, 0, 16
 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v1, 0, 16
 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v3, 0, 16
 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
index cb17f01853221..0c399d65d01cc 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
@@ -9828,14 +9828,14 @@ define amdgpu_kernel void @global_zextload_v4i8_to_v4i16(ptr addrspace(1) %out,
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00, v0
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 24, v0
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 24, v0
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xff00, v0
 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff, v0
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v0, v2, v0, 16
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v2, 8, v1
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v3, v2
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GCN-NOHSA-SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v1, v3, v2
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xff00ff, v0
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0
 ; GCN-NOHSA-SI-NEXT: s_endpgm
 ;
 ; GCN-HSA-LABEL: global_zextload_v4i8_to_v4i16:
@@ -9847,18 +9847,18 @@ define amdgpu_kernel void @global_zextload_v4i8_to_v4i16(ptr addrspace(1) %out,
 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT: flat_load_dword v2, v[0:1]
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
+; GCN-HSA-NEXT: flat_load_dword v0, v[0:1]
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1
 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00, v2
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 24, v2
-; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff, v2
-; GCN-HSA-NEXT: v_alignbit_b32 v2, v4, v2, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v3
-; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v2
-; GCN-HSA-NEXT: v_or_b32_e32 v2, v5, v4
-; GCN-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 24, v0
+; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xff00, v0
+; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff, v0
+; GCN-HSA-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GCN-HSA-NEXT: v_or_b32_e32 v4, v5, v4
+; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v0
+; GCN-HSA-NEXT: flat_store_dwordx2 v[2:3], v[4:5]
 ; GCN-HSA-NEXT: s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_zextload_v4i8_to_v4i16:
@@ -9877,10 +9877,10 @@ define amdgpu_kernel void @global_zextload_v4i8_to_v4i16(ptr addrspace(1) %out,
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 24, v0
 ; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v2, 8, v0
-; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v1, v1, v0, 16
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xff0000, v2
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v3, 0xff0000, v2
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b64 v[1:2], 16, v[0:1]
+; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xff00ff, v1
-; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NOHSA-VI-NEXT: s_endpgm
 ;
@@ -10179,33 +10179,39 @@ define amdgpu_kernel void @global_sextload_v4i8_to_v4i16(ptr addrspace(1) %out,
 define amdgpu_kernel void @global_zextload_v8i8_to_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_zextload_v8i8_to_v8i16:
 ; GCN-NOHSA-SI: ; %bb.0:
-; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3
 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xff00, v0
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 24, v0
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff00, v1
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v1
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xff, v1
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff, v0
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v1, v5, v1, 16
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v0, v3, v0, 16
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v5, 8, v2
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff00ff, v1
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v2, v6, v4
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v7, v5
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v1
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s1, s0, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s7, s6, 24
+; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s0, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s6, 0xff00
+; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s6, 0xff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s0, 0xff
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s9, s9, 8
+; GCN-NOHSA-SI-NEXT: s_lshl_b32 s8, s8, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
+; GCN-NOHSA-SI-NEXT: s_or_b32 s7, s10, s9
+; GCN-NOHSA-SI-NEXT: s_or_b32 s8, s11, s8
+; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xff00ff
+; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s0, 0xff00ff
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s6
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT: s_endpgm
 ;
 ; GCN-HSA-LABEL: global_zextload_v8i8_to_v8i16:
@@ -10221,20 +10227,26 @@ define amdgpu_kernel void @global_zextload_v8i8_to_v8i16(ptr addrspace(1) %out,
 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xff00, v0
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 24, v0
-; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff00, v1
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v1
-; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v1
-; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xff, v0
-; GCN-HSA-NEXT: v_alignbit_b32 v1, v7, v1, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; GCN-HSA-NEXT: v_alignbit_b32 v0, v3, v0, 16
-; GCN-HSA-NEXT: v_lshlrev_b32_e32 v7, 8, v2
-; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v1
-; GCN-HSA-NEXT: v_or_b32_e32 v2, v8, v6
-; GCN-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GCN-HSA-NEXT: v_or_b32_e32 v0, v9, v7
+; GCN-HSA-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v1
+; GCN-HSA-NEXT: s_lshr_b32 s1, s0, 24
+; GCN-HSA-NEXT: s_lshr_b32 s3, s2, 24
+; GCN-HSA-NEXT: s_and_b32 s4, s0, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s5, s2, 0xff00
+; GCN-HSA-NEXT: s_and_b32 s6, s2, 0xff
+; GCN-HSA-NEXT: s_and_b32 s7, s0, 0xff
+; GCN-HSA-NEXT: s_lshl_b32 s5, s5, 8
+; GCN-HSA-NEXT: s_lshl_b32 s4, s4, 8
+; GCN-HSA-NEXT: s_lshr_b64 s[2:3], s[2:3], 16
+; GCN-HSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
+; GCN-HSA-NEXT: s_or_b32 s1, s6, s5
+; GCN-HSA-NEXT: s_or_b32 s3, s7, s4
+; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xff00ff
+; GCN-HSA-NEXT: s_and_b32 s0, s0, 0xff00ff
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s2
 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT: s_endpgm
 ;
@@ -10252,22 +10264,26 @@ define amdgpu_kernel void @global_zextload_v8i8_to_v8i16(ptr addrspace(1) %out,
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v1
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v0
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v1
 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s4, 24
-; GCN-NOHSA-VI-NEXT: s_bfe_u32 s6, s4, 0x80010
-; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s4, 0xff
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 8
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 24, v0
-; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
-; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff0000
-; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v2, v2, v0, 16
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v3, 0xff0000, v1
-; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s6, s5
-; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s7, s4
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xff00ff, v2
-; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s6, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_u32 s8, s6, 0x80010
+; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s6, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 8
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s11, s4, 8
+; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s4, 0xff
+; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s7, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xff0000
+; GCN-NOHSA-VI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
+; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s8, s7
+; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s9, s6
+; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s10, s11
+; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff00ff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6
 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -10763,35 +10779,48 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(ptr addrspace(1) %out
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff00, v2
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v2
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xff00, v3
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 24, v3
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xff00, v0
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 24, v0
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xff00, v1
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 24, v1
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xff, v1
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v13, 0xff, v0
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, 0xff, v3
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v15, 0xff, v2
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v1, v11, v1, 16
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v0, v9, v0, 16
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v7, v7, v3, 16
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v5, v5, v2, 16
-; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff00ff, v1
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v2, v12, v10
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v13, v8
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff00ff, v7
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v6, v14, v6
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00ff, v5
-; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v4, v15, v4
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s4, v2
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v3
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s8, v0
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s10, v1
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s5, s4, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s7, s6, 24
+; 
GCN-NOHSA-SI-NEXT: s_and_b32 s12, s4, 0xff00 +; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s6, 0xff00 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s9, s8, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s11, s10, 24 +; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s8, 0xff00 +; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s10, 0xff00 +; GCN-NOHSA-SI-NEXT: s_and_b32 s16, s10, 0xff +; GCN-NOHSA-SI-NEXT: s_and_b32 s17, s8, 0xff +; GCN-NOHSA-SI-NEXT: s_and_b32 s18, s6, 0xff +; GCN-NOHSA-SI-NEXT: s_and_b32 s19, s4, 0xff +; GCN-NOHSA-SI-NEXT: s_lshl_b32 s15, s15, 8 +; GCN-NOHSA-SI-NEXT: s_lshl_b32 s14, s14, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[10:11], s[10:11], 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 16 +; GCN-NOHSA-SI-NEXT: s_lshl_b32 s9, s13, 8 +; GCN-NOHSA-SI-NEXT: s_lshl_b32 s11, s12, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; GCN-NOHSA-SI-NEXT: s_or_b32 s5, s16, s15 +; GCN-NOHSA-SI-NEXT: s_or_b32 s7, s17, s14 +; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xff00ff +; GCN-NOHSA-SI-NEXT: s_or_b32 s9, s18, s9 +; GCN-NOHSA-SI-NEXT: s_or_b32 s11, s19, s11 +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xff00ff +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xff00ff +; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xff00ff +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s6 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s10 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -10805,43 +10834,55 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xff00, v2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 24, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff00, v3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff00, v0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 24, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xff00, v1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 24, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xff, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v19, 0xff, v2 -; GCN-HSA-NEXT: v_alignbit_b32 v7, v7, v3, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GCN-HSA-NEXT: v_alignbit_b32 v5, v5, v2, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xff, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v0 -; GCN-HSA-NEXT: v_alignbit_b32 v1, v15, v1, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GCN-HSA-NEXT: v_alignbit_b32 v0, v13, v0, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v7 -; GCN-HSA-NEXT: v_or_b32_e32 v6, v18, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v5 -; GCN-HSA-NEXT: v_or_b32_e32 v4, v19, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v1 -; GCN-HSA-NEXT: v_or_b32_e32 v2, v16, v14 -; GCN-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 -; GCN-HSA-NEXT: 
v_or_b32_e32 v0, v17, v12 -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-HSA-NEXT: v_readfirstlane_b32 s8, v1 +; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v2 +; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v3 +; GCN-HSA-NEXT: s_lshr_b32 s7, s6, 24 +; GCN-HSA-NEXT: s_lshr_b32 s9, s8, 24 +; GCN-HSA-NEXT: s_lshr_b32 s3, s2, 24 +; GCN-HSA-NEXT: s_lshr_b32 s5, s4, 24 +; GCN-HSA-NEXT: s_and_b32 s10, s2, 0xff00 +; GCN-HSA-NEXT: s_and_b32 s11, s4, 0xff00 +; GCN-HSA-NEXT: s_and_b32 s12, s6, 0xff00 +; GCN-HSA-NEXT: s_and_b32 s13, s8, 0xff00 +; GCN-HSA-NEXT: s_and_b32 s14, s8, 0xff +; GCN-HSA-NEXT: s_and_b32 s15, s6, 0xff +; GCN-HSA-NEXT: s_lshr_b64 s[8:9], s[8:9], 16 +; GCN-HSA-NEXT: s_lshr_b64 s[6:7], s[6:7], 16 +; GCN-HSA-NEXT: s_and_b32 s16, s4, 0xff +; GCN-HSA-NEXT: s_and_b32 s17, s2, 0xff +; GCN-HSA-NEXT: s_lshl_b32 s13, s13, 8 +; GCN-HSA-NEXT: s_lshl_b32 s12, s12, 8 +; GCN-HSA-NEXT: s_lshl_b32 s7, s11, 8 +; GCN-HSA-NEXT: s_lshl_b32 s9, s10, 8 +; GCN-HSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; GCN-HSA-NEXT: s_lshr_b64 s[2:3], s[2:3], 16 +; GCN-HSA-NEXT: s_or_b32 s3, s14, s13 +; GCN-HSA-NEXT: s_or_b32 s5, s15, s12 +; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xff00ff +; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xff00ff +; GCN-HSA-NEXT: s_or_b32 s7, s16, s7 +; GCN-HSA-NEXT: s_or_b32 s9, s17, s9 +; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xff00ff +; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xff00ff +; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v16i8_to_v16i16: @@ -10858,42 +10899,50 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v3 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v1 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s5, 24 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s4, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_u32 s10, s4, 0x80010 -; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s4, 0xff -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v4, 24, v2 -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v5, 8, v2 -; GCN-NOHSA-VI-NEXT: s_bfe_u32 s7, s5, 0x80010 -; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s5, 0xff -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 8 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 16 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s9, s9, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff0000 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xff0000, v1 -; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v4, v4, v2, 16 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v7, 0xff0000, v5 -; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xff0000 -; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s7, s6 -; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s10, s9 -; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s11, s4 -; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v3, v3, v0, 16 -; GCN-NOHSA-VI-NEXT: 
v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v5, 0xff00ff, v4 -; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v4, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s8, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s7 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xff00ff, v3 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s6 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v3 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s9, v1 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s6, 24 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s4, 24 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s9, 24 +; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s6, 0xff +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s14, s6, 8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s8, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_u32 s16, s8, 0x80010 +; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s8, 0xff +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s8, s8, 8 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s19, s4, 8 +; GCN-NOHSA-VI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16 +; GCN-NOHSA-VI-NEXT: s_bfe_u32 s11, s9, 0x80010 +; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s9, 0xff +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s9, s9, 8 +; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s4, 0xff +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s10, s10, 16 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s15, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xff0000 +; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s19, 0xff0000 +; GCN-NOHSA-VI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xff0000 +; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xff0000 +; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s11, s10 +; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s16, s7 +; GCN-NOHSA-VI-NEXT: s_or_b32 s8, s17, s8 +; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s18, s15 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff00ff +; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s12, s9 +; GCN-NOHSA-VI-NEXT: s_or_b32 s10, s13, s14 +; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xff00ff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s5 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_v16i8_to_v16i16: @@ -11766,71 +11815,97 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff00, v2 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00, v3 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xff, v3 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v6, v6, v5 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v4, v7, v4 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 
v[8:11], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v3 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff00ff, v3 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 24, v2 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00ff, v2 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s4, v2 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v3 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s8, v0 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s10, v1 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff00, v10 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 24, v10 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s12, v6 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s14, v7 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s16, v4 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s18, v5 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s5, s4, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s7, s6, 24 +; GCN-NOHSA-SI-NEXT: s_and_b32 s20, s4, 0xff00 +; GCN-NOHSA-SI-NEXT: s_and_b32 s21, s6, 0xff00 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s9, s8, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s11, s10, 24 +; GCN-NOHSA-SI-NEXT: s_and_b32 s22, s8, 0xff00 +; GCN-NOHSA-SI-NEXT: s_and_b32 s23, s10, 0xff00 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s13, s12, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s14, 24 +; GCN-NOHSA-SI-NEXT: s_and_b32 s24, s12, 0xff00 +; GCN-NOHSA-SI-NEXT: s_and_b32 s25, s14, 0xff00 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s16, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s18, 24 +; GCN-NOHSA-SI-NEXT: s_and_b32 s26, s16, 0xff00 +; GCN-NOHSA-SI-NEXT: s_and_b32 s27, s18, 0xff00 +; GCN-NOHSA-SI-NEXT: s_and_b32 s28, s18, 0xff +; GCN-NOHSA-SI-NEXT: s_and_b32 s29, s16, 0xff +; GCN-NOHSA-SI-NEXT: s_and_b32 s30, s14, 0xff +; GCN-NOHSA-SI-NEXT: s_and_b32 s31, s12, 0xff +; GCN-NOHSA-SI-NEXT: s_and_b32 s33, s10, 0xff +; GCN-NOHSA-SI-NEXT: s_and_b32 s34, s8, 0xff +; GCN-NOHSA-SI-NEXT: s_and_b32 s35, s6, 0xff +; GCN-NOHSA-SI-NEXT: s_and_b32 s36, s4, 0xff +; GCN-NOHSA-SI-NEXT: s_lshl_b32 s27, s27, 8 +; GCN-NOHSA-SI-NEXT: s_lshl_b32 s26, s26, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[18:19], s[18:19], 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[16:17], s[16:17], 16 +; GCN-NOHSA-SI-NEXT: s_lshl_b32 s17, s25, 8 +; GCN-NOHSA-SI-NEXT: s_lshl_b32 s19, s24, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[14:15], s[14:15], 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[12:13], s[12:13], 16 +; GCN-NOHSA-SI-NEXT: s_lshl_b32 s13, s23, 8 +; GCN-NOHSA-SI-NEXT: s_lshl_b32 s15, s22, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[10:11], s[10:11], 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 16 +; GCN-NOHSA-SI-NEXT: s_lshl_b32 s9, s21, 8 +; GCN-NOHSA-SI-NEXT: s_lshl_b32 s11, s20, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; GCN-NOHSA-SI-NEXT: s_or_b32 s5, s28, s27 +; GCN-NOHSA-SI-NEXT: s_or_b32 s7, s29, s26 +; GCN-NOHSA-SI-NEXT: s_and_b32 s18, s18, 0xff00ff +; GCN-NOHSA-SI-NEXT: s_and_b32 s16, s16, 0xff00ff +; GCN-NOHSA-SI-NEXT: s_or_b32 s17, s30, s17 +; GCN-NOHSA-SI-NEXT: s_or_b32 s19, s31, s19 +; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, 0xff00ff +; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xff00ff +; GCN-NOHSA-SI-NEXT: s_or_b32 s13, s33, s13 +; GCN-NOHSA-SI-NEXT: s_or_b32 s15, s34, s15 +; GCN-NOHSA-SI-NEXT: 
s_and_b32 s10, s10, 0xff00ff +; GCN-NOHSA-SI-NEXT: s_or_b32 s9, s35, s9 +; GCN-NOHSA-SI-NEXT: s_or_b32 s11, s36, s11 +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xff00ff +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xff00ff +; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xff00ff +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s6 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v4, 24, v11 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 24, v9 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v7, v6, v9, 16 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v5, v5, v8, 16 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v12, v4, v11, 16 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v13, v2, v10, 16 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff00, v11 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xff00, v8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xff00, v9 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 24, v1 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v14, v14, v1, 16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 24, v0 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v15, v15, v0, 16 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xff00, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v17, 0xff00, v1 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xff, v0 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v6, 8, v2 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v2, v9, v0 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v8, v6 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v6, v11, v4 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v4, v10, v3 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v3, 8, v17 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v8, 8, v16 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v10, v1, v3 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v8, v18, v8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff00ff, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00ff, v5 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff00ff, v12 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00ff, v13 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xff00ff, v14 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v9, 0xff00ff, v15 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s10 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s14 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s18 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 
v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -11843,88 +11918,112 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 +; GCN-HSA-NEXT: s_waitcnt vmcnt(0) +; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v2 +; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v3 +; GCN-HSA-NEXT: v_readfirstlane_b32 s8, v0 +; GCN-HSA-NEXT: v_readfirstlane_b32 s10, v1 +; GCN-HSA-NEXT: s_lshr_b32 s5, s4, 24 +; GCN-HSA-NEXT: s_lshr_b32 s7, s6, 24 +; GCN-HSA-NEXT: s_and_b32 s12, s4, 0xff00 +; GCN-HSA-NEXT: s_and_b32 s13, s6, 0xff00 +; GCN-HSA-NEXT: s_lshr_b32 s9, s8, 24 +; GCN-HSA-NEXT: s_lshr_b32 s11, s10, 24 +; GCN-HSA-NEXT: s_and_b32 s14, s8, 0xff00 +; GCN-HSA-NEXT: s_and_b32 s15, s10, 0xff00 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: s_lshr_b64 s[2:3], s[10:11], 16 +; GCN-HSA-NEXT: s_and_b32 s17, s8, 0xff +; GCN-HSA-NEXT: s_lshr_b64 s[8:9], s[8:9], 16 +; GCN-HSA-NEXT: s_and_b32 s3, s6, 0xff +; GCN-HSA-NEXT: s_lshr_b64 s[6:7], s[6:7], 16 +; GCN-HSA-NEXT: s_and_b32 s16, s10, 0xff +; GCN-HSA-NEXT: s_lshl_b32 s14, s14, 8 +; GCN-HSA-NEXT: s_lshl_b32 s9, s13, 8 +; GCN-HSA-NEXT: s_and_b32 s10, s4, 0xff +; GCN-HSA-NEXT: s_lshl_b32 s11, s12, 8 +; GCN-HSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xff00ff +; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xff00ff +; GCN-HSA-NEXT: s_lshl_b32 s15, s15, 8 +; GCN-HSA-NEXT: s_or_b32 s7, s17, s14 +; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xff00ff +; GCN-HSA-NEXT: s_or_b32 s3, s3, s9 +; GCN-HSA-NEXT: s_or_b32 s9, s10, s11 +; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xff00ff +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s8 +; GCN-HSA-NEXT: s_or_b32 s5, s16, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s5 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; GCN-HSA-NEXT: s_waitcnt vmcnt(1) +; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-HSA-NEXT: v_readfirstlane_b32 s8, v1 +; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v2 +; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v3 +; GCN-HSA-NEXT: s_lshr_b32 s7, s6, 24 +; GCN-HSA-NEXT: s_lshr_b32 s9, s8, 24 +; GCN-HSA-NEXT: s_lshr_b32 s3, s2, 24 +; GCN-HSA-NEXT: s_lshr_b32 s5, s4, 24 +; GCN-HSA-NEXT: s_and_b32 s10, s2, 0xff00 +; GCN-HSA-NEXT: s_and_b32 s11, s4, 0xff00 +; GCN-HSA-NEXT: s_and_b32 s12, s6, 0xff00 +; GCN-HSA-NEXT: s_and_b32 s13, s8, 0xff00 +; GCN-HSA-NEXT: s_and_b32 s14, s8, 0xff +; GCN-HSA-NEXT: s_and_b32 s15, s6, 0xff +; GCN-HSA-NEXT: s_lshr_b64 s[8:9], s[8:9], 16 +; GCN-HSA-NEXT: s_lshr_b64 s[6:7], s[6:7], 16 +; GCN-HSA-NEXT: s_and_b32 s16, s4, 0xff +; GCN-HSA-NEXT: s_and_b32 s17, s2, 0xff +; GCN-HSA-NEXT: s_lshl_b32 s13, s13, 8 +; GCN-HSA-NEXT: s_lshl_b32 s12, s12, 8 +; GCN-HSA-NEXT: s_lshl_b32 s7, s11, 8 +; GCN-HSA-NEXT: s_lshl_b32 s9, s10, 8 +; GCN-HSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; GCN-HSA-NEXT: s_lshr_b64 s[2:3], s[2:3], 16 +; GCN-HSA-NEXT: s_or_b32 s3, s14, s13 +; GCN-HSA-NEXT: s_or_b32 s5, s15, s12 +; GCN-HSA-NEXT: s_and_b32 s8, 
s8, 0xff00ff +; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xff00ff +; GCN-HSA-NEXT: s_or_b32 s7, s16, s7 +; GCN-HSA-NEXT: s_or_b32 s9, s17, s9 +; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xff00ff +; GCN-HSA-NEXT: s_and_b32 s10, s2, 0xff00ff ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 -; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 24, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff00, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v7 -; GCN-HSA-NEXT: v_alignbit_b32 v7, v9, v7, 16 -; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xff00, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xff00ff, v7 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v6 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; GCN-HSA-NEXT: v_alignbit_b32 v7, v7, v6, 16 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GCN-HSA-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v7 -; GCN-HSA-NEXT: v_or_b32_e32 v6, v6, v16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[6:9] -; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff00, v4 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff00, v5 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 24, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xff, v4 -; GCN-HSA-NEXT: v_alignbit_b32 v5, v9, v5, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; GCN-HSA-NEXT: v_alignbit_b32 v9, v7, v4, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v6 -; GCN-HSA-NEXT: v_or_b32_e32 v6, v12, v8 -; GCN-HSA-NEXT: v_or_b32_e32 v4, v13, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v9 -; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xff00, v2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 24, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff00, v3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xff00, v0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 24, v0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xff00, v1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xff, v2 -; GCN-HSA-NEXT: v_alignbit_b32 v1, v5, v1, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GCN-HSA-NEXT: v_alignbit_b32 v0, v9, v0, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v5, 8, v13 -; GCN-HSA-NEXT: v_alignbit_b32 v9, v12, v3, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; GCN-HSA-NEXT: v_alignbit_b32 v12, v19, v2, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v13, 8, v18 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 -; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v1 -; GCN-HSA-NEXT: v_or_b32_e32 v2, v6, 
v4 -; GCN-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 -; GCN-HSA-NEXT: v_or_b32_e32 v0, v7, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v9 -; GCN-HSA-NEXT: v_or_b32_e32 v6, v10, v8 -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v12 -; GCN-HSA-NEXT: v_or_b32_e32 v4, v11, v13 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v32i8_to_v32i16: @@ -11942,79 +12041,95 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v3 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v7 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v5 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v1 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s7, 24 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s6, 24 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s4, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_u32 s18, s4, 0x80010 -; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s4, 0xff -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 24, v2 -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v5, 8, v4 -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v11, 8, v2 -; GCN-NOHSA-VI-NEXT: s_bfe_u32 s9, s7, 0x80010 -; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s7, 0xff -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s7, 8 -; GCN-NOHSA-VI-NEXT: s_bfe_u32 s12, s6, 0x80010 -; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s6, 0xff -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 8 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s5, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_u32 s15, s5, 0x80010 -; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s5, 0xff -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 8 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s8, s8, 16 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s11, s11, 16 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s17, s17, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff0000 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 24, v6 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v9, 8, v6 -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v10, 8, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xff0000, v5 -; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v13, v8, v2, 16 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v11, 0xff0000, v11 -; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xff0000 -; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xff0000 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s14, s14, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xff0000 -; GCN-NOHSA-VI-NEXT: s_or_b32 s8, s9, s8 -; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s12, s11 -; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s18, s17 -; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s19, s4 -; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v7, v7, v4, 16 -; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v1, v1, v6, 16 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v9, 0xff0000, v9 -; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v3, v3, v0, 16 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xff0000, v10 -; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v12, v2, v11 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v13, 0xff00ff, v13 -; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s10, s7 -; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s13, s6 -; GCN-NOHSA-VI-NEXT: s_or_b32 s10, s15, s14 -; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s16, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s11 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v5, 0xff00ff, v7 -; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v8, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v9, 0xff00ff, v1 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xff00ff, v3 -; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s9 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s8 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v6 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s12, v3 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s10, v4 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s15, v5 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s6, 24 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s8, 24 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s14, v7 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s4, 24 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s10, 24 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s15, 24 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s20, s10, 8 +; GCN-NOHSA-VI-NEXT: s_and_b32 s24, s8, 0xff +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s25, s8, 8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s13, 24 +; GCN-NOHSA-VI-NEXT: s_and_b32 s29, s6, 0xff +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s30, s6, 8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s31, s12, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_u32 s33, s12, 0x80010 +; GCN-NOHSA-VI-NEXT: s_and_b32 s34, s12, 0xff +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s12, s12, 8 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s36, s4, 8 +; GCN-NOHSA-VI-NEXT: s_lshr_b64 s[8:9], s[8:9], 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16 +; GCN-NOHSA-VI-NEXT: s_bfe_u32 s17, s15, 0x80010 +; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s15, 0xff +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s15, s15, 8 +; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s10, 0xff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s21, s14, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_u32 s22, s14, 0x80010 +; GCN-NOHSA-VI-NEXT: s_and_b32 s23, s14, 0xff +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s14, s14, 8 +; GCN-NOHSA-VI-NEXT: s_bfe_u32 s27, s13, 0x80010 +; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s13, 0xff +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s13, s13, 8 +; GCN-NOHSA-VI-NEXT: s_and_b32 s35, s4, 0xff +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s16, s16, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s20, s20, 0xff0000 +; GCN-NOHSA-VI-NEXT: s_lshr_b64 s[10:11], s[10:11], 16 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s9, s26, 16 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s31, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xff0000 +; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s36, 0xff0000 +; GCN-NOHSA-VI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, 0xff0000 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s11, s21, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xff0000 +; GCN-NOHSA-VI-NEXT: s_and_b32 s21, s25, 0xff0000 +; GCN-NOHSA-VI-NEXT: s_and_b32 s13, 
s13, 0xff0000 +; GCN-NOHSA-VI-NEXT: s_and_b32 s25, s30, 0xff0000 +; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s17, s16 +; GCN-NOHSA-VI-NEXT: s_or_b32 s16, s19, s20 +; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s33, s7 +; GCN-NOHSA-VI-NEXT: s_or_b32 s12, s34, s12 +; GCN-NOHSA-VI-NEXT: s_or_b32 s19, s35, s26 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff00ff +; GCN-NOHSA-VI-NEXT: s_or_b32 s15, s18, s15 +; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xff00ff +; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s22, s11 +; GCN-NOHSA-VI-NEXT: s_or_b32 s14, s23, s14 +; GCN-NOHSA-VI-NEXT: s_or_b32 s17, s24, s21 +; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xff00ff +; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s27, s9 +; GCN-NOHSA-VI-NEXT: s_or_b32 s13, s28, s13 +; GCN-NOHSA-VI-NEXT: s_or_b32 s18, s29, s25 +; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xff00ff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s19 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s13 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s17 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s14 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s11 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_v32i8_to_v32i16: diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll index 062a985dd7180..b4c0b7497b95f 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll @@ -5734,20 +5734,19 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read_b64 v[0:1], v0 -; SI-NEXT: v_mov_b32_e32 v9, s0 +; SI-NEXT: v_mov_b32_e32 v8, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; SI-NEXT: v_ashrrev_i32_e32 v1, 16, v1 -; SI-NEXT: v_bfe_i32 v3, v3, 0, 16 -; SI-NEXT: v_bfe_i32 v5, v0, 0, 16 -; SI-NEXT: v_bfe_i32 v7, v4, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; SI-NEXT: ds_write2_b64 v9, v[3:4], v[1:2] offset0:2 offset1:3 -; SI-NEXT: ds_write2_b64 v9, v[5:6], v[7:8] offset1:1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v1 +; SI-NEXT: v_ashrrev_i32_e32 v2, 16, v1 +; SI-NEXT: v_bfe_i32 v4, v1, 0, 16 +; SI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; SI-NEXT: v_bfe_i32 v6, v5, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; SI-NEXT: ds_write2_b64 v8, v[4:5], v[2:3] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v8, v[0:1], v[6:7] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_sextload_v4i16_to_v4i64: @@ -5757,20 +5756,20 @@ 
define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; VI-NO-DS128-NEXT: ds_read_b64 v[0:1], v0 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v8, s0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NO-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[6:7], v[4:5] offset0:2 offset1:3 -; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[0:1], v[2:3] offset1:1 +; VI-NO-DS128-NEXT: v_bfe_i32 v1, v1, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v7, v4, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v3, v0, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v5, v2, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s0 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[1:2], v[7:8] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_sextload_v4i16_to_v4i64: @@ -5779,20 +5778,20 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NO-DS128-NEXT: ds_read_b64 v[0:1], v0 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, s0 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v9, s0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[6:7], v[4:5] offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[0:1], v[2:3] offset1:1 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v7, v4, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v0, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v5, v2, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v9, v[1:2], v[7:8] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v9, v[3:4], v[5:6] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_sextload_v4i16_to_v4i64: @@ -5846,22 +5845,21 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 -; VI-DS128-NEXT: ds_read_b64 v[0:1], v0 +; VI-DS128-NEXT: 
ds_read_b64 v[1:2], v0 ; VI-DS128-NEXT: v_mov_b32_e32 v8, s0 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-DS128-NEXT: v_mov_b32_e32 v3, v1 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v0, v2, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v4, v1, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v6, v3, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:16 -; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] +; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:16 +; VI-DS128-NEXT: ds_write_b128 v8, v[4:7] ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_sextload_v4i16_to_v4i64: @@ -5869,22 +5867,21 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-DS128-NEXT: ds_read_b64 v[0:1], v0 +; GFX9-DS128-NEXT: ds_read_b64 v[1:2], v0 ; GFX9-DS128-NEXT: v_mov_b32_e32 v8, s0 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v0, v2, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX9-DS128-NEXT: v_bfe_i32 v4, v1, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v6, v3, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:16 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[0:3] +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX9-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:16 +; GFX9-DS128-NEXT: ds_write_b128 v8, v[4:7] ; GFX9-DS128-NEXT: s_endpgm %load = load <4 x i16>, ptr addrspace(3) %in %ext = sext <4 x i16> %load to <4 x i64> diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx10.mir b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx10.mir index e6a52342337f3..8ea9ec397fe06 100644 --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx10.mir +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx10.mir @@ -18,21 +18,21 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 
1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], 256, 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__literal_offsets ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 256, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 512, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX12-NEXT: SI_RETURN %0:vgpr_32 = V_ADD_U32_e64 %stack.0, 256, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_U32_e64 %stack.0, 512, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... @@ -53,27 +53,27 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 256, [[V_ADD_U32_e64_]], 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX10-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], -156, 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_2]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_2]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__literal_offsets_commute ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 256, %stack.0, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 512, %stack.0, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; 
GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX12-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 100, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_2]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_2]] ; GFX12-NEXT: SI_RETURN %0:vgpr_32 = V_ADD_U32_e64 256, %stack.0, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_U32_e64 512, %stack.0, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 %2:vgpr_32 = V_ADD_U32_e64 %stack.0, 100, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %2 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %2 SI_RETURN ... diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx8.mir b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx8.mir index 98b7f4a5aa1c5..71c47c80ae357 100644 --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx8.mir +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx8.mir @@ -21,9 +21,9 @@ body: | ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX803-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX803-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_CO_U32_e64_]] - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX803-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, [[V_ADD_CO_U32_e64_]], implicit-def dead $vcc, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets @@ -31,9 +31,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX900-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, [[V_ADD_U32_e64_]], implicit-def dead $vcc, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX900-NEXT: SI_RETURN ; 
; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets @@ -41,10 +41,10 @@ body: | ; GFX942-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 ; GFX942-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def dead $scc ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX942-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, [[COPY1]], implicit-def dead $vcc, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets @@ -52,9 +52,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX10-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, [[V_ADD_U32_e64_]], implicit-def dead $vcc, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets @@ -62,15 +62,15 @@ body: | ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def dead $scc ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX12-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, [[COPY1]], implicit-def dead $vcc, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX12-NEXT: SI_RETURN %0:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 
/* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... @@ -88,42 +88,42 @@ body: | bb.0: ; GFX803-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc ; GFX803: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX803-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc ; GFX900: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX900-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc ; GFX942: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX942-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc ; GFX10: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX10-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, 
[[V_ADD_CO_U32_e32_1]], implicit $vcc ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc ; GFX12: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX12-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc ; GFX12-NEXT: SI_RETURN %0:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1, implicit $vcc + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1, implicit $vcc SI_RETURN ... @@ -144,9 +144,9 @@ body: | ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX803-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX803-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_CO_U32_e64_]] - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX803-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 8, [[V_ADD_CO_U32_e64_]], implicit-def dead $vcc, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__inline_imm_offsets @@ -154,9 +154,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX900-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 8, [[V_ADD_U32_e64_]], implicit-def dead $vcc, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: 
local_stack_alloc__v_add_co_u32_e32__inline_imm_offsets @@ -164,10 +164,10 @@ body: | ; GFX942-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 ; GFX942-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def dead $scc ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX942-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 8, [[COPY1]], implicit-def dead $vcc, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__inline_imm_offsets @@ -175,9 +175,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX10-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 8, [[V_ADD_U32_e64_]], implicit-def dead $vcc, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__inline_imm_offsets @@ -185,15 +185,15 @@ body: | ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def dead $scc ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] ; GFX12-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 8, [[COPY1]], implicit-def dead $vcc, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX12-NEXT: SI_RETURN %0:vgpr_32 = V_ADD_CO_U32_e32 8, %stack.0, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_CO_U32_e32 16, %stack.0, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect 
attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... @@ -214,9 +214,9 @@ body: | ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX803-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX803-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_CO_U32_e64_]] - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX803-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 8, [[V_ADD_CO_U32_e64_]], 0, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets @@ -224,9 +224,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX900-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 8, [[V_ADD_U32_e64_]], 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets @@ -234,9 +234,9 @@ body: | ; GFX942-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 ; GFX942-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def dead $scc ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX942-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 8, [[S_ADD_I32_]], 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets @@ -244,9 +244,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; 
GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 8, [[V_ADD_U32_e64_]], 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets @@ -254,14 +254,14 @@ body: | ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def dead $scc ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 8, [[S_ADD_I32_]], 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX12-NEXT: SI_RETURN %0:vgpr_32, dead %2:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32, dead %3:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... 
@@ -279,42 +279,42 @@ body: | bb.0: ; GFX803-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc ; GFX803: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX803-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX803-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_1]] ; ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc ; GFX900: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX900-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX900-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_1]] ; ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc ; GFX942: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX942-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX942-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_1]] ; ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX10-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* 
reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX10-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_1]] ; ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc ; GFX12: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX12-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX12-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_1]] %0:vgpr_32, %2:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32, %3:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN implicit %2 ... @@ -332,42 +332,42 @@ body: | bb.0: ; GFX803-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets ; GFX803: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def dead $scc - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX803-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets ; GFX900: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def dead $scc - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX900-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets ; GFX942: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def dead $scc - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect 
attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX942-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets ; GFX10: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def dead $scc - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX10-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets ; GFX12: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def dead $scc - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX12-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX12-NEXT: SI_RETURN %0:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def dead $scc - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %0 %1:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %1 SI_RETURN ... 
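The integer operand that keeps changing in these tests is LLVM's inline-asm flag word, and this regeneration only touches its upper half. Here is a minimal Python sketch of the decoding, assuming the standard layout from llvm/include/llvm/IR/InlineAsm.h (operand kind in the low 3 bits, operand-register count in bits 3-15, and register class ID plus one in bits 16 and up, with 0 meaning "no register class"); it covers register operands only, since memory operands reuse the upper bits for a constraint ID:

# Decode the inline-asm flag words rewritten throughout this diff.
KINDS = {1: "reguse", 2: "regdef", 3: "regdefearlyclobber",
         4: "clobber", 5: "imm", 6: "mem", 7: "func"}

def decode_flag(flag: int) -> str:
    kind = KINDS.get(flag & 0x7, "?")       # low 3 bits: operand kind
    num_regs = (flag & 0xFFFF) >> 3         # bits 3-15: number of registers
    rc_plus_one = flag >> 16                # bits 16+: register class ID + 1
    rc = rc_plus_one - 1 if rc_plus_one else None
    return f"{flag} -> {kind}, numregs={num_regs}, regclass={rc}"

for flag in (1835017, 1245193, 2424841):
    print(decode_flag(flag))
# 1835017 -> reguse, numregs=1, regclass=27
# 1245193 -> reguse, numregs=1, regclass=18
# 2424841 -> reguse, numregs=1, regclass=36

Decoded this way, the update is self-consistent: every flag keeps kind reguse with a single register, and only the class ID moves (VGPR_32 from 27 to 18, SReg_32 from 36 to 27), which is why 1835017 appears on both sides of the diff with two different meanings. The CHECK lines themselves look machine-regenerated (e.g. with llvm/utils/update_mir_test_checks.py), so the constants simply track whatever IDs the current register-class enumeration assigns.
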
@@ -385,42 +385,42 @@ body: | bb.0: ; GFX803-LABEL: name: local_stack_alloc__s_add_i32__inline_imm_offsets ; GFX803: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX803-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__s_add_i32__inline_imm_offsets ; GFX900: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX900-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__s_add_i32__inline_imm_offsets ; GFX942: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX942-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__s_add_i32__inline_imm_offsets ; GFX10: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX10-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__s_add_i32__inline_imm_offsets ; GFX12: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX12-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* 
sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX12-NEXT: SI_RETURN %0:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %0 %1:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %1 SI_RETURN ... @@ -443,9 +443,9 @@ body: | ; GFX803-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX803-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX803-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY]], %stack.0, implicit-def dead $scc - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX803-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], %stack.0, implicit-def dead $scc - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets @@ -454,9 +454,9 @@ body: | ; GFX900-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX900-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX900-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY]], %stack.0, implicit-def dead $scc - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX900-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], %stack.0, implicit-def dead $scc - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets @@ -465,9 +465,9 @@ body: | ; GFX942-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX942-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY]], %stack.0, implicit-def dead $scc - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX942-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], %stack.0, implicit-def dead $scc - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets @@ -476,9 +476,9 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX10-NEXT: 
[[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY]], %stack.0, implicit-def dead $scc - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX10-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], %stack.0, implicit-def dead $scc - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets @@ -487,17 +487,17 @@ body: | ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY]], %stack.0, implicit-def dead $scc - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX12-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], %stack.0, implicit-def dead $scc - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX12-NEXT: SI_RETURN %0:sreg_32 = COPY $sgpr4 %1:sreg_32 = COPY $sgpr5 %2:sreg_32 = S_ADD_I32 %0, %stack.0, implicit-def dead $scc - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %2 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %2 %3:sreg_32 = S_ADD_I32 %1, %stack.0, implicit-def dead $scc - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %3 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %3 SI_RETURN ... 
@@ -520,9 +520,9 @@ body: | ; GFX803-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX803-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX803-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY]], implicit-def dead $scc - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX803-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY1]], implicit-def dead $scc - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets_commute @@ -531,9 +531,9 @@ body: | ; GFX900-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX900-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX900-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY]], implicit-def dead $scc - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX900-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY1]], implicit-def dead $scc - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets_commute @@ -542,9 +542,9 @@ body: | ; GFX942-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX942-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX942-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY]], implicit-def dead $scc - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX942-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY1]], implicit-def dead $scc - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets_commute @@ -553,9 +553,9 @@ body: | ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX10-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY]], implicit-def dead $scc - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX10-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY1]], implicit-def dead $scc - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX10-NEXT: SI_RETURN ; ; 
GFX12-LABEL: name: local_stack_alloc__s_add_i32__reg_offsets_commute @@ -564,17 +564,17 @@ body: | ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY]], implicit-def dead $scc - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX12-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[COPY1]], implicit-def dead $scc - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX12-NEXT: SI_RETURN %0:sreg_32 = COPY $sgpr4 %1:sreg_32 = COPY $sgpr5 %2:sreg_32 = S_ADD_I32 %stack.0, %0, implicit-def dead $scc - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %2 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %2 %3:sreg_32 = S_ADD_I32 %stack.0, %1, implicit-def dead $scc - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %3 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %3 SI_RETURN ... @@ -592,48 +592,48 @@ body: | bb.0: ; GFX803-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets_live_scc ; GFX803: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX803-NEXT: S_NOP 0, implicit $scc ; GFX803-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def $scc - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX803-NEXT: SI_RETURN implicit $scc ; ; GFX900-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets_live_scc ; GFX900: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX900-NEXT: S_NOP 0, implicit $scc ; GFX900-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def $scc - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX900-NEXT: SI_RETURN implicit $scc ; ; GFX942-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets_live_scc ; GFX942: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX942-NEXT: S_NOP 0, implicit $scc ; GFX942-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = 
S_ADD_I32 512, %stack.0, implicit-def $scc - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX942-NEXT: SI_RETURN implicit $scc ; ; GFX10-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets_live_scc ; GFX10: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX10-NEXT: S_NOP 0, implicit $scc ; GFX10-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def $scc - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX10-NEXT: SI_RETURN implicit $scc ; ; GFX12-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets_live_scc ; GFX12: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_]] ; GFX12-NEXT: S_NOP 0, implicit $scc ; GFX12-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def $scc - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, [[S_ADD_I32_1]] ; GFX12-NEXT: SI_RETURN implicit $scc %0:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %0 S_NOP 0, implicit $scc %1:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def $scc - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:SReg_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:SReg_32 */, %1 SI_RETURN implicit $scc ... 
@@ -656,9 +656,9 @@ body: | ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX803-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX803-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX803-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets @@ -667,9 +667,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX900-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX900-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets @@ -678,9 +678,9 @@ body: | ; GFX942-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0 ; GFX942-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX942-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX942-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets @@ -689,9 +689,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX10-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, 
[[V_ADD_CO_U32_e32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX10-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets @@ -700,15 +700,15 @@ body: | ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0 ; GFX12-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX12-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX12-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] ; GFX12-NEXT: SI_RETURN %vgpr_offset:vgpr_32 = COPY $vgpr0 %0:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... 
@@ -731,9 +731,9 @@ body: | ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX803-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX803-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX803-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets_commute @@ -742,9 +742,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX900-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX900-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets_commute @@ -753,9 +753,9 @@ body: | ; GFX942-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0 ; GFX942-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX942-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX942-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets_commute @@ -764,9 +764,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX10-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* 
reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX10-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets_commute @@ -775,15 +775,15 @@ body: | ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0 ; GFX12-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX12-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX12-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] ; GFX12-NEXT: SI_RETURN %vgpr_offset:vgpr_32 = COPY $vgpr0 %0:vgpr_32 = V_ADD_CO_U32_e32 %stack.0, %vgpr_offset, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_CO_U32_e32 %stack.0, %vgpr_offset, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... 
@@ -805,9 +805,9 @@ body: | ; GFX803-NEXT: {{ $}} ; GFX803-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX803-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX803-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__sgpr_offsets @@ -815,9 +815,9 @@ body: | ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX900-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX900-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e32__sgpr_offsets @@ -825,9 +825,9 @@ body: | ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX942-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX942-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__sgpr_offsets @@ -836,9 +836,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX10-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX10-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, [[V_MOV_B32_e32_]], 
implicit-def dead $vcc, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__sgpr_offsets @@ -848,16 +848,16 @@ body: | ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GFX12-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, [[COPY]], implicit-def dead $vcc, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GFX12-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, [[COPY1]], implicit-def dead $vcc, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] ; GFX12-NEXT: SI_RETURN %sgpr_offset:sreg_32 = COPY $sgpr8 %0:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... 
@@ -880,9 +880,9 @@ body: | ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX803-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX803-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX803-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets @@ -891,9 +891,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX900-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX900-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets @@ -903,10 +903,10 @@ body: | ; GFX942-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GFX942-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[COPY]], 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GFX942-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[COPY1]], 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets @@ -915,9 +915,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX10-NEXT: 
[[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX10-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets @@ -926,15 +926,15 @@ body: | ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0 ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[S_MOV_B32_]], 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX12-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[S_MOV_B32_]], 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX12-NEXT: SI_RETURN %sgpr_offset:sreg_32 = COPY $sgpr8 %0:vgpr_32, dead %2:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32, dead %3:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... 
@@ -957,9 +957,9 @@ body: | ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX803-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX803-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX803-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec - ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX803-NEXT: SI_RETURN ; ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets_commute @@ -968,9 +968,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX900-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX900-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets_commute @@ -980,10 +980,10 @@ body: | ; GFX942-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX942-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY %sgpr_offset ; GFX942-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], [[COPY]], 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY %sgpr_offset ; GFX942-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], [[COPY1]], 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets_commute @@ -992,9 +992,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; 
GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX10-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets_commute @@ -1003,15 +1003,15 @@ body: | ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0 ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], %sgpr_offset, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] ; GFX12-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], %sgpr_offset, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] ; GFX12-NEXT: SI_RETURN %sgpr_offset:sreg_32 = COPY $sgpr8 %0:vgpr_32, dead %2:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32, dead %3:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... 
diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx9.mir b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx9.mir index 19ca463d7ecbb..f0868ffeeb7c5 100644 --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx9.mir +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx9.mir @@ -20,16 +20,16 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX900-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 256, [[V_ADD_U32_e64_]], implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e32__literal_offsets ; GFX942: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 256, %stack.0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX942-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 512, %stack.0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e32__literal_offsets @@ -37,21 +37,21 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX10-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 256, [[V_ADD_U32_e64_]], implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e32__literal_offsets ; GFX12: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 256, %stack.0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX12-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 512, %stack.0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* 
reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX12-NEXT: SI_RETURN %0:vgpr_32 = V_ADD_U32_e32 256, %stack.0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_U32_e32 512, %stack.0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... @@ -72,16 +72,16 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX900-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 8, [[V_ADD_U32_e64_]], implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e32__inline_imm_offsets ; GFX942: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 8, %stack.0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX942-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 16, %stack.0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e32__inline_imm_offsets @@ -89,21 +89,21 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX10-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 8, [[V_ADD_U32_e64_]], implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e32__inline_imm_offsets ; GFX12: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 8, %stack.0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* 
reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX12-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 16, %stack.0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX12-NEXT: SI_RETURN %0:vgpr_32 = V_ADD_U32_e32 8, %stack.0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_U32_e32 16, %stack.0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... @@ -124,16 +124,16 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX900-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 8, [[V_ADD_U32_e64_]], 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets ; GFX942: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX942-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 16, %stack.0, 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets @@ -141,21 +141,21 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 8, [[V_ADD_U32_e64_]], 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 
/* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 16, %stack.0, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX12-NEXT: SI_RETURN %0:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_U32_e64 16, %stack.0, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... @@ -178,9 +178,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX900-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX900-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets @@ -188,9 +188,9 @@ body: | ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX942-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX942-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets @@ -199,9 +199,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX10-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* 
sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX10-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets @@ -209,15 +209,15 @@ body: | ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX12-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX12-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX12-NEXT: SI_RETURN %vgpr_offset:vgpr_32 = COPY $vgpr0 %0:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... 
@@ -240,9 +240,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX900-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX900-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets_commute @@ -250,9 +250,9 @@ body: | ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX942-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX942-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets_commute @@ -261,9 +261,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX10-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX10-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets_commute @@ -271,15 +271,15 @@ body: | ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 ; GFX12-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX12-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX12-NEXT: INLINEASM &"; 
use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX12-NEXT: SI_RETURN %vgpr_offset:vgpr_32 = COPY $vgpr0 %0:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... @@ -301,9 +301,9 @@ body: | ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX900-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX900-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e32__sgpr_offsets @@ -311,9 +311,9 @@ body: | ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX942-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX942-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e32__sgpr_offsets @@ -322,9 +322,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX10-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, [[V_MOV_B32_e32_]], implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX10-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, [[V_MOV_B32_e32_]], implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e32__sgpr_offsets @@ -332,15 +332,15 @@ body: | ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX12-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = 
V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] ; GFX12-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] ; GFX12-NEXT: SI_RETURN %sgpr_offset:sreg_32 = COPY $sgpr8 %0:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... @@ -363,9 +363,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX900-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets @@ -373,9 +373,9 @@ body: | ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX942-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX942-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets @@ -384,9 +384,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 
1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets @@ -394,15 +394,15 @@ body: | ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX12-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX12-NEXT: SI_RETURN %sgpr_offset:sreg_32 = COPY $sgpr8 %0:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... 
@@ -425,9 +425,9 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX900-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets_commute @@ -435,9 +435,9 @@ body: | ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX942-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX942-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets_commute @@ -446,9 +446,9 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets_commute @@ -456,15 +456,15 @@ body: | ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 ; GFX12-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; 
GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX12-NEXT: SI_RETURN %sgpr_offset:sreg_32 = COPY $sgpr8 %0:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... @@ -486,16 +486,16 @@ body: | ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX900-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 8, [[V_ADD_U32_e64_]], 1, implicit $exec - ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX900-NEXT: SI_RETURN ; ; GFX942-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets_clamp_modifier ; GFX942: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, 1, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX942-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 16, %stack.0, 1, implicit $exec - ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX942-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX942-NEXT: SI_RETURN ; ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets_clamp_modifier @@ -503,21 +503,21 @@ body: | ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 8, [[V_ADD_U32_e64_]], 1, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets_clamp_modifier ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, 1, implicit $exec 
- ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 16, %stack.0, 1, implicit $exec - ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] ; GFX12-NEXT: SI_RETURN %0:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, /*clamp*/1, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %0 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %0 %1:vgpr_32 = V_ADD_U32_e64 16, %stack.0, /*clamp*/1, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %1 SI_RETURN ... diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir index c117473581746..1e8a2d3ad9163 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir @@ -6429,7 +6429,7 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %22, 1835017 /* reguse:VGPR_32 */, [[V_CVT_I32_F64_e32_4]] + ; GFX908-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %22, 1245193 /* reguse:VGPR_32 */, [[V_CVT_I32_F64_e32_4]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) @@ -6478,7 +6478,7 @@ body: | %19:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 %20:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 %21:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 - INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %22:vgpr_32, 1835017 /* reguse:VGPR_32 */, %4 + INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, 1245194 /* regdef:VGPR_32 */, def %22:vgpr_32, 1245193 /* reguse:VGPR_32 */, %4 %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll index 08ec0c847e941..87d52684e588c 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -632,12 +632,12 @@ define i64 @mad_i64_i32_extops_i32_i64(i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3] ; GFX1100-NEXT: v_ashrrev_i32_e32 v5, 31, v5 -; GFX1100-NEXT: v_mov_b32_e32 v3, v1 +; GFX1100-NEXT: v_mad_u64_u32 v[2:3], null, v5, v4, v[1:2] ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_mad_u64_u32 v[1:2], null, v5, v4, v[3:4] +; GFX1100-NEXT: v_mov_b32_e32 v1, v2 ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; GFX1150-LABEL: mad_i64_i32_extops_i32_i64: @@ -775,13 +775,13 @@ define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) # ; GFX1100-LABEL: mad_u64_u32_bitops_lhs_mask_small: ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v0 -; GFX1100-NEXT: v_mov_b32_e32 v6, v1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v2, v3, v[4:5] -; GFX1100-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_and_b32 v5, 1, v6 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5] +; GFX1100-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v6, v1 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[4:5] +; GFX1100-NEXT: v_and_b32_e32 v5, 1, v6 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_mad_u64_u32 v[3:4], null, v5, v2, v[1:2] +; GFX1100-NEXT: v_mov_b32_e32 v1, v3 ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; GFX1150-LABEL: mad_u64_u32_bitops_lhs_mask_small: @@ -863,11 +863,12 @@ define i64 @mad_u64_u32_bitops_rhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) # ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-NEXT: v_mov_b32_e32 v6, v0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, v[4:5] -; GFX1100-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v4, 1, v3 +; GFX1100-NEXT: v_and_b32_e32 v4, 1, v3 +; GFX1100-NEXT: v_mad_u64_u32 v[2:3], null, v6, v4, v[1:2] ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[3:4] +; GFX1100-NEXT: v_mov_b32_e32 v1, v2 ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; GFX1150-LABEL: mad_u64_u32_bitops_rhs_mask_small: @@ -1807,10 +1808,9 @@ define i64 @lshr_mad_i64_4(i32 %arg0, i64 %arg1) #0 { ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-NEXT: v_mad_u64_u32 v[3:4], null, v1, v0, 0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v4, 0 -; GFX1100-NEXT: v_mad_u64_u32 v[5:6], null, v2, v0, v[1:2] -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_mad_u64_u32 v[5:6], null, v2, v0, v[4:5] +; GFX1100-NEXT: v_mov_b32_e32 v4, 0 ; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, 0xfffffc88, v5, v[3:4] ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; @@ -1818,10 +1818,9 @@ define i64 @lshr_mad_i64_4(i32 %arg0, i64 %arg1) #0 { ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1150-NEXT: v_mad_u64_u32 v[3:4], null, v1, v0, 0 -; GFX1150-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v4, 0 -; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, v2, v0, v[1:2] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, v2, v0, v[4:5] +; GFX1150-NEXT: v_mov_b32_e32 v4, 0 ; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, 0xfffffc88, v0, v[3:4] ; GFX1150-NEXT: s_setpc_b64 s[30:31] ; @@ -1833,10 +1832,9 @@ define i64 @lshr_mad_i64_4(i32 %arg0, i64 %arg1) #0 { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v1, v0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v4, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v0, v[1:2] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v0, v[4:5] +; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, 0xfffffc88, v0, v[3:4] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2126,23 +2124,21 @@ define i64 @lshr_mad_i64_negative_4(i64 %arg0) #0 { ; GFX1100-LABEL: lshr_mad_i64_negative_4: ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_mad_u64_u32 v[2:3], null, v1, v0, v[0:1] +; GFX1100-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_mov_b32_e32 v0, v3 -; GFX1100-NEXT: v_mad_u64_u32 v[3:4], null, v1, v1, v[0:1] +; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[2:3] +; GFX1100-NEXT: v_mad_u64_u32 v[4:5], null, v3, v3, v[1:2] ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX1100-NEXT: v_mov_b32_e32 v1, v4 ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; GFX1150-LABEL: lshr_mad_i64_negative_4: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1150-NEXT: v_mad_u64_u32 v[3:4], null, v1, v0, v[0:1] +; GFX1150-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v1, v0 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_mov_b32_e32 v0, v4 -; GFX1150-NEXT: v_mad_u64_u32 v[1:2], null, v1, v1, v[0:1] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1150-NEXT: v_mov_b32_e32 v0, v3 +; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, v2, v1, v[1:2] +; GFX1150-NEXT: v_mad_u64_u32 v[1:2], null, v2, v2, v[1:2] ; GFX1150-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: lshr_mad_i64_negative_4: @@ -2152,12 +2148,10 @@ define i64 @lshr_mad_i64_negative_4(i64 %arg0) #0 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v1, v0, v[0:1] +; GFX12-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v1, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v0, v4 -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v1, v1, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v1, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v2, v2, v[1:2] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: lshr_mad_i64_negative_4: 
diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards.mir index 8b2c5887c97ee..5095ad021fde3 100644 --- a/llvm/test/CodeGen/AMDGPU/mai-hazards.mir +++ b/llvm/test/CodeGen/AMDGPU/mai-hazards.mir @@ -33,7 +33,7 @@ name: asm_write_vgpr_accvgpr_write_read body: | bb.0: - INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0 + INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0 $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ... @@ -47,7 +47,7 @@ name: asm_write_vgpr_accvgpr_write_read_partialnop body: | bb.0: - INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr0 + INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr0 S_NOP 0 $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ... @@ -60,7 +60,7 @@ name: asm_write_vgpr_accvgpr_write_read_otherreg body: | bb.0: liveins: $vgpr0 - INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def $vgpr1 + INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def $vgpr1 $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ... diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index d29847e40dc8b..4681d589ac217 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -2989,34 +2989,33 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4c ; VI-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x7c ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v5, 0 +; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s10 -; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s12, v0, 0 +; VI-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s12, v0, 0 ; VI-NEXT: s_mul_i32 s4, s12, s11 -; VI-NEXT: v_mov_b32_e32 v6, s12 -; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v3 -; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s8, v6, 0 -; VI-NEXT: s_mul_i32 s6, s13, s10 -; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 -; VI-NEXT: v_mov_b32_e32 v4, v1 -; VI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s9, v6, v[4:5] -; VI-NEXT: v_mov_b32_e32 v8, s8 -; VI-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s14, v8, v[2:3] -; VI-NEXT: v_mov_b32_e32 v4, v6 -; VI-NEXT: v_mov_b32_e32 v6, s13 -; VI-NEXT: v_mad_u64_u32 v[1:2], s[4:5], s8, v6, v[4:5] +; VI-NEXT: v_mov_b32_e32 v5, s12 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: s_mul_i32 s4, s13, s10 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v0 +; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s8, v5, 0 +; VI-NEXT: v_mov_b32_e32 v7, s8 +; VI-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s14, v7, v[3:4] +; VI-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s9, v5, v[1:2] +; VI-NEXT: v_mov_b32_e32 v7, s13 ; VI-NEXT: s_mul_i32 s6, s15, s8 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v9 -; VI-NEXT: v_add_u32_e32 v2, vcc, v7, v2 -; VI-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc -; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s9, v6, v[2:3] +; VI-NEXT: v_mov_b32_e32 v1, v5 +; VI-NEXT: v_mad_u64_u32 v[1:2], s[4:5], s8, v7, v[1:2] +; VI-NEXT: v_add_u32_e32 v8, vcc, s6, v4 +; VI-NEXT: v_add_u32_e32 v4, vcc, v6, v2 +; VI-NEXT: v_addc_u32_e64 v5, s[4:5], 0, 0, vcc +; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s9, v7, v[4:5] ; VI-NEXT: s_mul_i32 s6, s14, s9 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v8 +; VI-NEXT: 
v_add_u32_e32 v6, vcc, s6, v8 +; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v3 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -3370,67 +3369,66 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v0 -; VI-NEXT: v_mov_b32_e32 v10, 0 +; VI-NEXT: v_mov_b32_e32 v9, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_add_u32_e32 v12, vcc, s2, v2 -; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v12, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; VI-NEXT: flat_load_dwordx4 v[4:7], v[12:13] +; VI-NEXT: flat_load_dwordx4 v[4:7], v[11:12] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_lo_u32 v3, v4, v3 -; VI-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v4, v2, 0 -; VI-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0 +; VI-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v4, v2, 0 ; VI-NEXT: v_mul_lo_u32 v2, v5, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, v15, v3 -; VI-NEXT: v_add_u32_e32 v15, vcc, v3, v2 -; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v1, v4, v[9:10] -; VI-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v6, v0, v[14:15] -; VI-NEXT: v_mov_b32_e32 v9, v2 -; VI-NEXT: v_mul_lo_u32 v2, v7, v0 -; VI-NEXT: v_mad_u64_u32 v[9:10], s[0:1], v0, v5, v[9:10] +; VI-NEXT: v_mul_lo_u32 v10, v7, v0 +; VI-NEXT: v_mad_u64_u32 v[7:8], s[0:1], v0, v4, 0 +; VI-NEXT: v_add_u32_e32 v3, vcc, v14, v3 +; VI-NEXT: v_add_u32_e32 v14, vcc, v3, v2 +; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v1, v4, v[8:9] +; VI-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v6, v0, v[13:14] +; VI-NEXT: v_mov_b32_e32 v8, v2 +; VI-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v5, v[8:9] ; VI-NEXT: v_mul_lo_u32 v4, v6, v1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v2, v15 -; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v10 +; VI-NEXT: v_add_u32_e32 v6, vcc, v10, v14 +; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v9 ; VI-NEXT: v_addc_u32_e64 v3, s[0:1], 0, 0, vcc ; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[2:3] ; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v6 -; VI-NEXT: v_add_u32_e32 v10, vcc, v0, v14 -; VI-NEXT: v_addc_u32_e32 v11, vcc, v1, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-NEXT: v_add_u32_e32 v9, vcc, v0, v13 +; VI-NEXT: v_addc_u32_e32 v10, vcc, v1, v2, vcc +; VI-NEXT: flat_store_dwordx4 v[11:12], v[7:10] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_mul_i128: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 4, v0 -; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 4, v0 +; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v14, s[0:1] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v14, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[0:3], v13, s[0:1] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v13, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v10, v5, v2 -; GFX9-NEXT: v_mul_lo_u32 v12, v4, v3 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v4, v2, 0 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0 -; GFX9-NEXT: v_add3_u32 v9, v9, v12, v10 -; GFX9-NEXT: v_mul_lo_u32 v15, v6, v1 -; GFX9-NEXT: v_mov_b32_e32 v10, v3 -; GFX9-NEXT: 
v_mad_u64_u32 v[12:13], s[0:1], v1, v4, v[10:11] -; GFX9-NEXT: v_mov_b32_e32 v10, v12 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v0, v5, v[10:11] -; GFX9-NEXT: v_mul_lo_u32 v10, v7, v0 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v6, v0, v[8:9] -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v13, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[0:1], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[8:9] -; GFX9-NEXT: v_add3_u32 v5, v10, v7, v15 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v5, vcc -; GFX9-NEXT: global_store_dwordx4 v14, v[2:5], s[2:3] +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0 +; GFX9-NEXT: v_mul_lo_u32 v14, v5, v2 +; GFX9-NEXT: v_mul_lo_u32 v15, v4, v3 +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[0:1], v1, v4, v[9:10] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v9, v11 +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[0:1], v0, v5, v[9:10] +; GFX9-NEXT: v_add3_u32 v3, v3, v15, v14 +; GFX9-NEXT: v_mul_lo_u32 v4, v6, v1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v6, v0, v[2:3] +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v10 +; GFX9-NEXT: v_mul_lo_u32 v14, v7, v0 +; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, 0, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[6:7] +; GFX9-NEXT: v_add3_u32 v3, v14, v3, v4 +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v1, v3, vcc +; GFX9-NEXT: global_store_dwordx4 v13, v[8:11], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_mul_i128: @@ -3468,37 +3466,36 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_lshlrev_b32 v17, 4, v0 +; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v15, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[0:3], v17, s[0:1] -; GFX11-NEXT: global_load_b128 v[4:7], v17, s[2:3] +; GFX11-NEXT: global_load_b128 v[0:3], v15, s[0:1] +; GFX11-NEXT: global_load_b128 v[4:7], v15, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v0, v4, 0 -; GFX11-NEXT: v_mul_lo_u32 v18, v5, v2 -; GFX11-NEXT: v_mul_lo_u32 v3, v4, v3 -; GFX11-NEXT: v_mad_u64_u32 v[15:16], null, v4, v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v11, v9 -; GFX11-NEXT: v_add3_u32 v16, v16, v3, v18 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v1, v4, v[11:12] -; GFX11-NEXT: v_mul_lo_u32 v4, v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v11, v13 +; GFX11-NEXT: v_mul_lo_u32 v16, v5, v2 +; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v4, v2, 0 +; GFX11-NEXT: v_mul_lo_u32 v17, v6, v1 +; GFX11-NEXT: v_mul_lo_u32 v18, v7, v0 +; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v1, v4, v[9:10] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v0, v5, v[11:12] -; GFX11-NEXT: v_mul_lo_u32 v12, v7, v0 -; GFX11-NEXT: v_add_co_u32 v2, s0, v14, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s0 -; GFX11-NEXT: v_mad_u64_u32 v[10:11], null, v6, v0, v[15:16] -; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v1, v5, v[2:3] +; GFX11-NEXT: 
v_mov_b32_e32 v9, v11 +; GFX11-NEXT: v_mul_lo_u32 v11, v4, v3 +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, v5, v[9:10] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add3_u32 v0, v12, v11, v4 -; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v6, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v14, v14, v11, v16 +; GFX11-NEXT: v_add_co_u32 v3, s0, v12, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, 0, 0, s0 +; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v6, v0, v[13:14] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v1, v5, v[3:4] +; GFX11-NEXT: v_add3_u32 v0, v18, v10, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v6, v9 ; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, v7, v0, vcc_lo -; GFX11-NEXT: global_store_b128 v17, v[8:11], s[2:3] +; GFX11-NEXT: v_mov_b32_e32 v9, v2 +; GFX11-NEXT: global_store_b128 v15, v[8:11], s[2:3] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_mul_i128: diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll index 34dd69f966637..a6de45a9b8a4e 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll @@ -11,10 +11,10 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX908: bb.0 (%ir-block.0): ; REGALLOC-GFX908-NEXT: liveins: $sgpr4_sgpr5 ; REGALLOC-GFX908-NEXT: {{ $}} - ; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:AGPR_32 */, undef %6:agpr_32 - ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7602186 /* regdef:VReg_128 */, def %25 + ; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1638409 /* reguse:AGPR_32 */, undef %6:agpr_32 + ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7012362 /* regdef:VReg_128 */, def %25 ; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:av_128 = COPY %25 - ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3407882 /* regdef:VReg_64 */, def %27 + ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2818058 /* regdef:VReg_64 */, def %27 ; REGALLOC-GFX908-NEXT: SI_SPILL_AV64_SAVE %27, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) ; REGALLOC-GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[COPY]] ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) @@ -38,10 +38,10 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX908-NEXT: $sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3 ; PEI-GFX908-NEXT: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 ; PEI-GFX908-NEXT: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 - ; PEI-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:AGPR_32 */, undef renamable $agpr0 - ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 
7602186 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 + ; PEI-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1638409 /* reguse:AGPR_32 */, undef renamable $agpr0 + ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7012362 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec - ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3407882 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1 + ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2818058 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1 ; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) ; PEI-GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1 ; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec @@ -62,10 +62,10 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX90A: bb.0 (%ir-block.0): ; REGALLOC-GFX90A-NEXT: liveins: $sgpr4_sgpr5 ; REGALLOC-GFX90A-NEXT: {{ $}} - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:AGPR_32 */, undef %6:agpr_32 - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7929866 /* regdef:VReg_128_Align2 */, def %23 + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1638409 /* reguse:AGPR_32 */, undef %6:agpr_32 + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:VReg_128_Align2 */, def %23 ; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:av_128_align2 = COPY %23 - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3670026 /* regdef:VReg_64_Align2 */, def %21 + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3080202 /* regdef:VReg_64_Align2 */, def %21 ; REGALLOC-GFX90A-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY %21 ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64_align2, [[COPY]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; REGALLOC-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) @@ -83,10 +83,10 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX90A-NEXT: {{ $}} ; PEI-GFX90A-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 ; PEI-GFX90A-NEXT: frame-setup CFI_INSTRUCTION undefined $pc_reg - ; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:AGPR_32 */, undef renamable $agpr0 - ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7929866 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 + ; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1638409 /* reguse:AGPR_32 */, undef renamable $agpr0 + ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, 
implicit $exec - ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3670026 /* regdef:VReg_64_Align2 */, def renamable $vgpr2_vgpr3 + ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3080202 /* regdef:VReg_64_Align2 */, def renamable $vgpr2_vgpr3 ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; PEI-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir b/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir index a97ef058ce5fa..db8908bcbac67 100644 --- a/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir +++ b/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir @@ -43,17 +43,17 @@ machineFunctionInfo: body: | bb.0: - INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:AGPR_32 */, implicit-def $agpr0 + INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1638410 /* regdef:AGPR_32 */, implicit-def $agpr0 %14:vgpr_32 = COPY killed $agpr0 - INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 39714826 /* regdef:VReg_512 */, def %7, 19136522 /* regdef:VReg_256 */, def %8, 7602186 /* regdef:VReg_128 */, def %9, 5636106 /* regdef:VReg_96 */, def %10, 5636106 /* regdef:VReg_96 */, def %11 + INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 39125002 /* regdef:VReg_512 */, def %7, 18546698 /* regdef:VReg_256 */, def %8, 7012362 /* regdef:VReg_128 */, def %9, 5046282 /* regdef:VReg_96 */, def %10, 5046282 /* regdef:VReg_96 */, def %11 INLINEASM &"; clobber", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def dead early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 12 /* clobber */, implicit-def dead early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39714825 /* reguse:VReg_512 */, %7 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 19136521 /* reguse:VReg_256 */, %8 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7602185 /* reguse:VReg_128 */, %9 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5636105 /* reguse:VReg_96 */, %10 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5636105 /* reguse:VReg_96 */, %11 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39125001 /* reguse:VReg_512 */, %7 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 18546697 /* reguse:VReg_256 */, %8 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7012361 /* reguse:VReg_128 */, %9 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5046281 /* reguse:VReg_96 */, %10 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5046281 /* reguse:VReg_96 */, %11 $agpr1 = COPY %14 - INLINEASM &"; use $0", 1 /* sideeffect 
attdialect */, 2228233 /* reguse:AGPR_32 */, killed $agpr1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1638409 /* reguse:AGPR_32 */, killed $agpr1 SI_RETURN ... diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index 8abbdad893819..bbc04aa46adc5 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -203,28 +203,27 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or_b32_e32 v13, v8, v14 ; GFX9-NEXT: .LBB0_6: ; %Flow3 ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX9-NEXT: v_mul_lo_u32 v18, v13, v5 +; GFX9-NEXT: v_mul_lo_u32 v16, v13, v5 ; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v23, v13, 0 -; GFX9-NEXT: v_mov_b32_e32 v15, 0 -; GFX9-NEXT: v_mul_lo_u32 v9, v11, v4 -; GFX9-NEXT: v_mov_b32_e32 v14, v6 -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v4, 0 -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v22, v13, v[14:15] -; GFX9-NEXT: v_add3_u32 v8, v8, v18, v9 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v23, v[7:8] -; GFX9-NEXT: v_mov_b32_e32 v14, v16 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v23, v11, v[14:15] +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, 0 +; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v22, v13, v[6:7] +; GFX9-NEXT: v_mul_lo_u32 v15, v11, v4 ; GFX9-NEXT: v_mul_lo_u32 v4, v12, v22 -; GFX9-NEXT: v_mul_lo_u32 v12, v10, v23 -; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v17, v9 -; GFX9-NEXT: v_addc_co_u32_e64 v10, s[4:5], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v22, v11, v[9:10] -; GFX9-NEXT: v_add3_u32 v4, v12, v7, v4 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v9, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v10, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v23, v11, v[6:7] +; GFX9-NEXT: v_add3_u32 v9, v9, v16, v15 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v23, v[8:9] +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v14, v7 +; GFX9-NEXT: v_addc_co_u32_e64 v13, s[4:5], 0, 0, vcc +; GFX9-NEXT: v_mul_lo_u32 v15, v10, v23 +; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v22, v11, v[12:13] +; GFX9-NEXT: v_add3_u32 v4, v15, v9, v4 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v10, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v11, v4, vcc ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v8, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v20 ; GFX9-NEXT: v_xor_b32_e32 v1, v1, v21 @@ -1590,25 +1589,24 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: v_mul_lo_u32 v19, v12, v7 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v12, 0 -; GFX9-NEXT: v_mov_b32_e32 v17, 0 +; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: v_mul_lo_u32 v18, v13, v6 -; GFX9-NEXT: v_mov_b32_e32 v16, v8 -; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, 0 -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v12, v[16:17] -; GFX9-NEXT: v_add3_u32 v10, v10, v19, v18 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v4, v[9:10] -; GFX9-NEXT: v_mov_b32_e32 v16, v11 -; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v4, v13, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v5, v12, v[8:9] +; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, v16 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v13, 
v[8:9] +; GFX9-NEXT: v_add3_u32 v11, v11, v19, v18 +; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v4, v[10:11] ; GFX9-NEXT: v_mul_lo_u32 v6, v14, v5 -; GFX9-NEXT: v_mul_lo_u32 v14, v15, v4 -; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v12, v11 -; GFX9-NEXT: v_addc_co_u32_e64 v12, s[4:5], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v13, v[11:12] -; GFX9-NEXT: v_add3_u32 v6, v14, v9, v6 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, v17, v9 +; GFX9-NEXT: v_mul_lo_u32 v12, v15, v4 +; GFX9-NEXT: v_addc_co_u32_e64 v15, s[4:5], 0, 0, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v13, v[14:15] +; GFX9-NEXT: v_add3_u32 v6, v12, v11, v6 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v10 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v6, vcc ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v10, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v8, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v5, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/rename-independent-subregs.mir b/llvm/test/CodeGen/AMDGPU/rename-independent-subregs.mir index 35be513c784bb..8ac85fa9c41a2 100644 --- a/llvm/test/CodeGen/AMDGPU/rename-independent-subregs.mir +++ b/llvm/test/CodeGen/AMDGPU/rename-independent-subregs.mir @@ -73,7 +73,7 @@ body: | # (1) %0.sub0 + %0.sub0 and (2) %0.sub1 + %0.sub1 # Check that renaming (2) does not inadvertently rename (1). # CHECK-LABEL: name: test2 -# CHECK: INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def undef %0.sub0, 1835018 /* regdef:VGPR_32 */, def dead %1.sub1, 2147483657 /* reguse tiedto:$0 */, undef %0.sub0(tied-def 3), 2147549193 /* reguse tiedto:$1 */, %1.sub1(tied-def 5) +# CHECK: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def undef %0.sub0, 1245194 /* regdef:VGPR_32 */, def dead %1.sub1, 2147483657 /* reguse tiedto:$0 */, undef %0.sub0(tied-def 3), 2147549193 /* reguse tiedto:$1 */, %1.sub1(tied-def 5) name: test2 body: | bb.0: @@ -81,7 +81,7 @@ body: | bb.1: undef %0.sub1:vreg_64 = V_ALIGNBIT_B32_e64 %0.sub0:vreg_64, %0.sub0:vreg_64, 16, implicit $exec - INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def undef %0.sub0:vreg_64, 1835018 /* regdef:VGPR_32 */, def %0.sub1:vreg_64, 2147483657 /* reguse tiedto:$0 */, undef %0.sub0:vreg_64(tied-def 3), 2147549193 /* reguse tiedto:$1 */, %0.sub1:vreg_64(tied-def 5) + INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def undef %0.sub0:vreg_64, 1245194 /* regdef:VGPR_32 */, def %0.sub1:vreg_64, 2147483657 /* reguse tiedto:$0 */, undef %0.sub0:vreg_64(tied-def 3), 2147549193 /* reguse tiedto:$1 */, %0.sub1:vreg_64(tied-def 5) S_BRANCH %bb.1 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir index 39f64185b9d57..cdd68630bf4ff 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir @@ -43,7 +43,7 @@ body: | ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1) ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[GLOBAL_LOAD_DWORDX4_]] ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]]:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[COPY3]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3670025 /* reguse:VReg_64_Align2 */, [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3080201 /* reguse:VReg_64_Align2 */, [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]] ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 %1:av_64_align2 = COPY $vgpr0_vgpr1 @@ -51,7 +51,7 @@ body: | %3:areg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1) %4:vreg_128_align2 = COPY %3 %5:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %4.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3670025 /* reguse:VReg_64_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3080201 /* reguse:VReg_64_Align2 */, %5 SI_RETURN ... diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir index c764bc17f0631..d7b713aa53b86 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir @@ -19,7 +19,7 @@ body: | ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN @@ -30,7 +30,7 @@ body: | %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec undef %5.sub0_sub1:areg_128_align2 = COPY %4 %5.sub2_sub3 = IMPLICIT_DEF - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5 GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1) GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN @@ -172,7 +172,7 @@ body: | ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]].sub2_sub3:areg_128_align2 
= V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3 ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN @@ -183,7 +183,7 @@ body: | undef %4.sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec undef %5.sub0_sub1:areg_128_align2 = COPY %4.sub2_sub3 %5.sub2_sub3 = IMPLICIT_DEF - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5 GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1) GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN @@ -208,7 +208,7 @@ body: | ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]].sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]].sub2 ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN @@ -219,7 +219,7 @@ body: | undef %4.sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec undef %5.sub1:areg_128_align2 = COPY %4.sub2 %5.sub2_sub3 = IMPLICIT_DEF - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5 GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1) GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir index c0f5a7737afb0..3f61c3dbfaf37 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir @@ -17,7 +17,7 @@ body: | ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1) ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], 
[[GLOBAL_LOAD_DWORDX4_]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -26,7 +26,7 @@ body: | %3:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1) %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec undef %5.sub0_sub1:areg_128_align2 = COPY %4 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5 GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN ... @@ -47,7 +47,7 @@ body: | ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1) ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX4_]].sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -56,7 +56,7 @@ body: | %3:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1) %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3.sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec undef %5.sub0_sub1:areg_128_align2 = COPY %4 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5 GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN ... 
@@ -79,7 +79,7 @@ body: | ; CHECK-NEXT: dead %other_use:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub0_sub1 ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_1:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s64), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -90,7 +90,7 @@ body: | %other_use:vreg_64_align2 = COPY %4.sub0_sub1 %5:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %4.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec %6:areg_64_align2 = COPY %5 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %6:areg_64_align2 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %6:areg_64_align2 GLOBAL_STORE_DWORDX2 %0, %6, 0, 0, implicit $exec :: (store (s64), addrspace 1) SI_RETURN ... @@ -114,7 +114,7 @@ body: | ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_e64_1:%[0-9]+]].sub0_sub1:areg_128_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_2:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_2]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4587529 /* reguse:AReg_64_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:AReg_64_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s64), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -126,7 +126,7 @@ body: | undef %5.sub0_sub1:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %4, 0, 0, 0, implicit $mode, implicit $exec %6:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %5.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec %7:areg_64_align2 = COPY %6 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4587529 /* reguse:AReg_64_Align2 */, %7 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:AReg_64_Align2 */, %7 GLOBAL_STORE_DWORDX2 %0, %7, 0, 0, implicit $exec :: (store (s64), addrspace 1) SI_RETURN @@ -151,7 +151,7 @@ body: | ; CHECK-NEXT: dead %other_use:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1 ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_2:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_2]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store 
(s128), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -163,7 +163,7 @@ body: | %other_use:vreg_64_align2 = COPY %5.sub0_sub1 %6:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %5.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec undef %8.sub0_sub1:areg_128_align2 = COPY %6 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %8:areg_128_align2 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %8:areg_128_align2 GLOBAL_STORE_DWORDX4 %0, %8, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN @@ -189,7 +189,7 @@ body: | ; CHECK-NEXT: dead %other_use1:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1 ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_2:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[COPY3:%[0-9]+]]:agpr_32 = COPY [[V_MFMA_F64_4X4X4F64_e64_2]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:AGPR_32 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1638409 /* reguse:AGPR_32 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s32), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -202,7 +202,7 @@ body: | %other_use1:vreg_64_align2 = COPY %5.sub0_sub1 %6:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %5.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec %8:agpr_32 = COPY %6 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:AGPR_32 */, %8:agpr_32 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1638409 /* reguse:AGPR_32 */, %8:agpr_32 GLOBAL_STORE_DWORD %0, %8, 0, 0, implicit $exec :: (store (s32), addrspace 1) SI_RETURN @@ -231,7 +231,7 @@ body: | ; CHECK-NEXT: dead %other_use1:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3 ; CHECK-NEXT: dead %other_use2:vreg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -245,7 +245,7 @@ body: | %other_use1:vreg_64_align2 = COPY %4.sub2_sub3 %other_use2:vreg_64 = COPY %4.sub1_sub2 %6:areg_128_align2 = COPY %4 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:AReg_128_Align2 */, %6:areg_128_align2 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %6:areg_128_align2 GLOBAL_STORE_DWORDX4 %0, %6, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN ... 
@@ -273,7 +273,7 @@ body: | ; CHECK-NEXT: %other_use1:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3 ; CHECK-NEXT: dead %other_use2:vreg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], %other_use1, 0, 0, implicit $exec :: (store (s64), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -287,7 +287,7 @@ body: | %other_use1:vreg_64_align2 = COPY %4.sub2_sub3 %other_use2:vreg_64 = COPY %4.sub1_sub2 %6:areg_64 = COPY %4.sub1_sub2 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:AReg_64 */, %6:areg_64 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3407881 /* reguse:AReg_64 */, %6:areg_64 GLOBAL_STORE_DWORDX2 %0, %other_use1, 0, 0, implicit $exec :: (store (s64), addrspace 1) SI_RETURN ... @@ -313,7 +313,7 @@ body: | ; CHECK-NEXT: %other_use1:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3 ; CHECK-NEXT: dead %other_use2:vreg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:AReg_64 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3407881 /* reguse:AReg_64 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], %other_use1, 0, 0, implicit $exec :: (store (s64), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -327,7 +327,7 @@ body: | %other_use1:vreg_64_align2 = COPY %4.sub2_sub3 %other_use2:vreg_64 = COPY %4.sub1_sub2 %6:areg_64 = COPY %4.sub1_sub2 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:AReg_64 */, %6:areg_64 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3407881 /* reguse:AReg_64 */, %6:areg_64 GLOBAL_STORE_DWORDX2 %0, %other_use1, 0, 0, implicit $exec :: (store (s64), addrspace 1) SI_RETURN ... 
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll index 52ef811875f88..a6c019bf374d7 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -18,7 +18,7 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr add ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_mov_b32_e32 v1, v0 @@ -40,7 +40,7 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr add ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -86,7 +86,7 @@ define amdgpu_kernel void @scalar_to_vector_v2f32(ptr addrspace(1) %out, ptr add ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_mov_b32_e32 v1, v0 @@ -108,7 +108,7 @@ define amdgpu_kernel void @scalar_to_vector_v2f32(ptr addrspace(1) %out, ptr add ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir index af882c06e1b4e..b754a6b897159 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir @@ -37,7 +37,7 @@ body: | ; CHECK-NEXT: dead [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128 = DS_READ_B128_gfx9 [[V_ADD_U32_e32_]], 0, 0, implicit $exec ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub1:vreg_512 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def dead [[COPY1]], 1835018 /* regdef:VGPR_32 */, def dead [[COPY]].sub1, 1835017 /* reguse:VGPR_32 */, [[COPY1]], 1835017 /* reguse:VGPR_32 */, [[COPY]].sub1 + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def dead [[COPY1]], 1245194 /* regdef:VGPR_32 */, def dead [[COPY]].sub1, 1245193 /* reguse:VGPR_32 */, [[COPY1]], 1245193 /* reguse:VGPR_32 */, [[COPY]].sub1 ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub0:vreg_512 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub3:vreg_512 = COPY [[COPY]].sub3 ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:vreg_512 = COPY undef [[V_MOV_B32_e32_1]] @@ -63,7 +63,7 @@ body: | undef %11.sub0:vreg_512 = COPY %4.sub0 %12:vgpr_32 = COPY %4.sub0 %11.sub1:vreg_512 = COPY %4.sub1 - INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def dead %12:vgpr_32, 1835018 /* regdef:VGPR_32 */, def dead %4.sub1:vreg_512, 1835017 /* reguse:VGPR_32 */, %12:vgpr_32, 1835017 /* reguse:VGPR_32 */, %4.sub1:vreg_512 + 
INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def dead %12:vgpr_32, 1245194 /* regdef:VGPR_32 */, def dead %4.sub1:vreg_512, 1245193 /* reguse:VGPR_32 */, %12:vgpr_32, 1245193 /* reguse:VGPR_32 */, %4.sub1:vreg_512 %11.sub2:vreg_512 = COPY undef %1 %11.sub3:vreg_512 = COPY %4.sub3 %11.sub5:vreg_512 = COPY undef %1 diff --git a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir index 3e8e1878e0be5..5edb9669d98eb 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir @@ -40,18 +40,18 @@ body: | ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def dead %11 + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def dead %11 ; CHECK-NEXT: GLOBAL_STORE_DWORD undef %12:vreg_64, [[BUFFER_LOAD_DWORD_OFFEN]], 0, 0, implicit $exec :: (store (s32), addrspace 1) ; CHECK-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 undef %14:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 3) - ; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %15, 1835018 /* regdef:VGPR_32 */, def %16 + ; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %15, 1245194 /* regdef:VGPR_32 */, def %16 ; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec ; CHECK-NEXT: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_2]], 0, 0, implicit $exec ; CHECK-NEXT: [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 undef %20:vgpr_32, 0, 0, implicit $exec - ; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %21, 1835018 /* regdef:VGPR_32 */, def %22 + ; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %21, 1245194 /* regdef:VGPR_32 */, def %22 ; CHECK-NEXT: [[DS_READ_B32_gfx9_3:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_2]], 0, 0, implicit $exec - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def dead [[V_MOV_B32_e32_3]], 1835018 /* regdef:VGPR_32 */, def dead [[V_MOV_B32_e32_4]], 1835017 /* reguse:VGPR_32 */, [[DS_READ_B64_gfx9_]].sub0, 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_3]](tied-def 3), 2147549193 /* reguse tiedto:$1 */, [[V_MOV_B32_e32_4]](tied-def 5), 1835017 /* reguse:VGPR_32 */, %15, 1835017 /* reguse:VGPR_32 */, %16, 1835017 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_1]], 1835017 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_]], 1835017 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_3]], 1835017 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_2]] + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def dead [[V_MOV_B32_e32_3]], 1245194 /* regdef:VGPR_32 */, def dead [[V_MOV_B32_e32_4]], 1245193 /* reguse:VGPR_32 */, [[DS_READ_B64_gfx9_]].sub0, 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_3]](tied-def 3), 2147549193 /* reguse tiedto:$1 */, 
[[V_MOV_B32_e32_4]](tied-def 5), 1245193 /* reguse:VGPR_32 */, %15, 1245193 /* reguse:VGPR_32 */, %16, 1245193 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_1]], 1245193 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_]], 1245193 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_3]], 1245193 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_2]] ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_1]] ; CHECK-NEXT: DS_WRITE_B32_gfx9 undef %28:vgpr_32, %21, 0, 0, implicit $exec :: (store (s32), addrspace 3) ; CHECK-NEXT: DS_WRITE_B32_gfx9 undef %29:vgpr_32, %22, 0, 0, implicit $exec :: (store (s32), addrspace 3) @@ -94,21 +94,21 @@ body: | %10:vgpr_32 = IMPLICIT_DEF bb.1: - INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %11:vgpr_32 + INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %11:vgpr_32 GLOBAL_STORE_DWORD undef %12:vreg_64, %1, 0, 0, implicit $exec :: (store (s32), addrspace 1) %13:vreg_64 = DS_READ_B64_gfx9 undef %14:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 3) - INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %15:vgpr_32, 1835018 /* regdef:VGPR_32 */, def %16:vgpr_32 + INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %15:vgpr_32, 1245194 /* regdef:VGPR_32 */, def %16:vgpr_32 %17:vgpr_32 = DS_READ_B32_gfx9 %6, 0, 0, implicit $exec %18:vgpr_32 = DS_READ_B32_gfx9 %7, 0, 0, implicit $exec %19:vgpr_32 = DS_READ_B32_gfx9 undef %20:vgpr_32, 0, 0, implicit $exec - INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %21:vgpr_32, 1835018 /* regdef:VGPR_32 */, def %22:vgpr_32 + INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %21:vgpr_32, 1245194 /* regdef:VGPR_32 */, def %22:vgpr_32 %23:vgpr_32 = DS_READ_B32_gfx9 %7, 0, 0, implicit $exec %24:vgpr_32 = V_MOV_B32_e32 0, implicit $exec %5.sub1:vreg_64 = COPY %6 %25:vgpr_32 = V_ADD_U32_e32 1, %10, implicit $exec %26:sreg_64_xexec = V_CMP_GT_U32_e64 64, %25, implicit $exec %27:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def dead %24:vgpr_32, 1835018 /* regdef:VGPR_32 */, def dead %27:vgpr_32, 1835017 /* reguse:VGPR_32 */, %13.sub0:vreg_64, 2147483657 /* reguse tiedto:$0 */, %24:vgpr_32(tied-def 3), 2147549193 /* reguse tiedto:$1 */, %27:vgpr_32(tied-def 5), 1835017 /* reguse:VGPR_32 */, %15, 1835017 /* reguse:VGPR_32 */, %16, 1835017 /* reguse:VGPR_32 */, %18, 1835017 /* reguse:VGPR_32 */, %17, 1835017 /* reguse:VGPR_32 */, %23, 1835017 /* reguse:VGPR_32 */, %19 + INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def dead %24:vgpr_32, 1245194 /* regdef:VGPR_32 */, def dead %27:vgpr_32, 1245193 /* reguse:VGPR_32 */, %13.sub0:vreg_64, 2147483657 /* reguse tiedto:$0 */, %24:vgpr_32(tied-def 3), 2147549193 /* reguse tiedto:$1 */, %27:vgpr_32(tied-def 5), 1245193 /* reguse:VGPR_32 */, %15, 1245193 /* reguse:VGPR_32 */, %16, 1245193 /* reguse:VGPR_32 */, %18, 1245193 /* reguse:VGPR_32 */, %17, 1245193 /* reguse:VGPR_32 */, %23, 1245193 /* reguse:VGPR_32 */, %19 DS_WRITE_B32_gfx9 undef %28:vgpr_32, %21, 0, 0, implicit $exec :: (store (s32), addrspace 3) DS_WRITE_B32_gfx9 undef %29:vgpr_32, %22, 0, 0, implicit $exec :: (store (s32), addrspace 3) DS_WRITE_B64_gfx9 undef %30:vgpr_32, %5, 0, 0, implicit $exec :: (store (s64), addrspace 3) diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll 
b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll index 9b3dc7f531021..287d1dde21403 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v2f32_v2f32__1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -111,12 +110,11 @@ define void @v_shuffle_v2f32_v2f32__3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -154,16 +152,14 @@ define void @v_shuffle_v2f32_v2f32__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -208,15 +204,14 @@ define void @v_shuffle_v2f32_v2f32__3_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -304,12 +299,12 @@ define void @v_shuffle_v2f32_v2f32__3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -347,12 +342,11 @@ define void 
@v_shuffle_v2f32_v2f32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -473,12 +467,11 @@ define void @v_shuffle_v2f32_v2f32__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -593,12 +586,12 @@ define void @v_shuffle_v2f32_v2f32__1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -724,12 +717,11 @@ define void @v_shuffle_v2f32_v2f32__1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__1_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -874,11 +866,11 @@ define void @v_shuffle_v2f32_v2f32__1_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll index 34043cd067b25..d5998e289c09d 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll @@ -99,36 +99,33 @@ define void @v_shuffle_v2f32_v3f32__2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__2_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; 
GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -194,36 +191,33 @@ define void @v_shuffle_v2f32_v3f32__5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -237,49 +231,45 @@ define void @v_shuffle_v2f32_v3f32__5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; 
GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__5_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__5_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -293,46 +283,43 @@ define void @v_shuffle_v2f32_v3f32__5_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__5_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__5_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt 
vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -346,15 +333,14 @@ define void @v_shuffle_v2f32_v3f32__5_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx2 v6, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -362,15 +348,14 @@ define void @v_shuffle_v2f32_v3f32__5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -378,15 +363,14 @@ define void @v_shuffle_v2f32_v3f32__5_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -400,13 +384,12 @@ define void @v_shuffle_v2f32_v3f32__5_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -445,36 +428,37 @@ define void @v_shuffle_v2f32_v3f32__5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: 
global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -488,38 +472,37 @@ define void @v_shuffle_v2f32_v3f32__5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -533,12 +516,11 @@ define void @v_shuffle_v2f32_v3f32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], 
s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -659,13 +641,12 @@ define void @v_shuffle_v2f32_v3f32__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -703,12 +684,11 @@ define void @v_shuffle_v2f32_v3f32__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -918,36 +898,37 @@ define void @v_shuffle_v2f32_v3f32__2_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__2_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__2_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1178,38 +1159,37 @@ define void @v_shuffle_v2f32_v3f32__2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: 
v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1408,36 +1388,33 @@ define void @v_shuffle_v2f32_v3f32__2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v3f32__2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1654,42 +1631,42 @@ define void @v_shuffle_v2f32_v3f32__2_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx2 v6, v[3:4], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt 
vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__2_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1939,43 +1916,42 @@ define void @v_shuffle_v2f32_v3f32__2_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v3f32__2_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2175,7 +2151,7 @@ define void @s_shuffle_v2f32_v3f32__2_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 
s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2187,7 +2163,7 @@ define void @s_shuffle_v2f32_v3f32__2_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2199,7 +2175,7 @@ define void @s_shuffle_v2f32_v3f32__2_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2274,7 +2250,7 @@ define void @s_shuffle_v2f32_v3f32__5_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2286,7 +2262,7 @@ define void @s_shuffle_v2f32_v3f32__5_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2298,7 +2274,7 @@ define void @s_shuffle_v2f32_v3f32__5_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2317,11 +2293,11 @@ define void @s_shuffle_v2f32_v3f32__5_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s8 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2333,11 +2309,11 @@ define void @s_shuffle_v2f32_v3f32__5_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s8 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2347,13 +2323,13 @@ define void @s_shuffle_v2f32_v3f32__5_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2372,10 +2348,11 @@ define void @s_shuffle_v2f32_v3f32__5_1() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2387,10 +2364,11 @@ define void 
@s_shuffle_v2f32_v3f32__5_1() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2400,12 +2378,13 @@ define void @s_shuffle_v2f32_v3f32__5_1() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:10] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2424,11 +2403,11 @@ define void @s_shuffle_v2f32_v3f32__5_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2440,11 +2419,11 @@ define void @s_shuffle_v2f32_v3f32__5_2() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2454,13 +2433,13 @@ define void @s_shuffle_v2f32_v3f32__5_2() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s6 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2479,8 +2458,8 @@ define void @s_shuffle_v2f32_v3f32__5_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2492,8 +2471,8 @@ define void @s_shuffle_v2f32_v3f32__5_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2505,8 +2484,8 @@ define void @s_shuffle_v2f32_v3f32__5_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2519,17 +2498,44 @@ define void @s_shuffle_v2f32_v3f32__5_3() { } 
define void @s_shuffle_v2f32_v3f32__5_4() { -; GFX9-LABEL: s_shuffle_v2f32_v3f32__5_4: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:10] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:9] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v2f32_v3f32__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v2f32_v3f32__5_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s1 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=s"() %vec1 = call <3 x float> asm "; def $0", "=s"() %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> @@ -2544,8 +2550,8 @@ define void @s_shuffle_v2f32_v3f32__5_5() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2557,8 +2563,8 @@ define void @s_shuffle_v2f32_v3f32__5_5() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2570,8 +2576,8 @@ define void @s_shuffle_v2f32_v3f32__5_5() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b32 s3, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2695,8 +2701,8 @@ define void @s_shuffle_v2f32_v3f32__2_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2708,8 +2714,8 @@ define void @s_shuffle_v2f32_v3f32__2_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; 
GFX90A-NEXT: ;;#ASMEND @@ -2721,8 +2727,8 @@ define void @s_shuffle_v2f32_v3f32__2_0() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2929,17 +2935,44 @@ define void @s_shuffle_v2f32_v3f32__1_1() { } define void @s_shuffle_v2f32_v3f32__2_1() { -; GFX9-LABEL: s_shuffle_v2f32_v3f32__2_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:10] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:9] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v2f32_v3f32__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v2f32_v3f32__2_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s1 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=s"() %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x float> %shuf) @@ -3150,8 +3183,8 @@ define void @s_shuffle_v2f32_v3f32__2_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3163,8 +3196,8 @@ define void @s_shuffle_v2f32_v3f32__2_2() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3176,8 +3209,8 @@ define void @s_shuffle_v2f32_v3f32__2_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b32 s3, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3388,7 +3421,7 @@ define void @s_shuffle_v2f32_v3f32__2_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; 
GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3400,7 +3433,7 @@ define void @s_shuffle_v2f32_v3f32__2_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3412,7 +3445,7 @@ define void @s_shuffle_v2f32_v3f32__2_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3635,10 +3668,11 @@ define void @s_shuffle_v2f32_v3f32__2_4() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3650,10 +3684,11 @@ define void @s_shuffle_v2f32_v3f32__2_4() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3663,12 +3698,13 @@ define void @s_shuffle_v2f32_v3f32__2_4() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:10] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3897,11 +3933,11 @@ define void @s_shuffle_v2f32_v3f32__2_5() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3913,11 +3949,11 @@ define void @s_shuffle_v2f32_v3f32__2_5() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3927,13 +3963,13 @@ define void @s_shuffle_v2f32_v3f32__2_5() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s6 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s6 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND diff --git 
a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll index 676a521757bd8..a86ca0a4a23c6 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v2i32_v2i32__1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -111,12 +110,11 @@ define void @v_shuffle_v2i32_v2i32__3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -154,16 +152,14 @@ define void @v_shuffle_v2i32_v2i32__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -208,15 +204,14 @@ define void @v_shuffle_v2i32_v2i32__3_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -304,12 +299,12 @@ define void @v_shuffle_v2i32_v2i32__3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 
s[30:31] ; @@ -347,12 +342,11 @@ define void @v_shuffle_v2i32_v2i32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -473,12 +467,11 @@ define void @v_shuffle_v2i32_v2i32__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -593,12 +586,12 @@ define void @v_shuffle_v2i32_v2i32__1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -724,12 +717,11 @@ define void @v_shuffle_v2i32_v2i32__1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__1_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -874,11 +866,11 @@ define void @v_shuffle_v2i32_v2i32__1_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll index f65340470feb1..d46ca61cff64d 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll @@ -99,36 +99,33 @@ define void @v_shuffle_v2i32_v3i32__2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__2_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: 
; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -194,36 +191,33 @@ define void @v_shuffle_v2i32_v3i32__5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -237,49 +231,45 @@ define void @v_shuffle_v2i32_v3i32__5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: 
global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__5_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -293,46 +283,43 @@ define void @v_shuffle_v2i32_v3i32__5_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__5_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx2 v5, 
v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -346,15 +333,14 @@ define void @v_shuffle_v2i32_v3i32__5_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx2 v6, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -362,15 +348,14 @@ define void @v_shuffle_v2i32_v3i32__5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -378,15 +363,14 @@ define void @v_shuffle_v2i32_v3i32__5_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -400,13 +384,12 @@ define void @v_shuffle_v2i32_v3i32__5_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -445,36 +428,37 @@ define void @v_shuffle_v2i32_v3i32__5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: 
v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -488,38 +472,37 @@ define void @v_shuffle_v2i32_v3i32__5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -533,12 +516,11 @@ define void @v_shuffle_v2i32_v3i32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: 
global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -659,13 +641,12 @@ define void @v_shuffle_v2i32_v3i32__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -703,12 +684,11 @@ define void @v_shuffle_v2i32_v3i32__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -918,36 +898,37 @@ define void @v_shuffle_v2i32_v3i32__2_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__2_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__2_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1178,38 +1159,37 @@ define void @v_shuffle_v2i32_v3i32__2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: 
v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1408,36 +1388,33 @@ define void @v_shuffle_v2i32_v3i32__2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v3i32__2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1654,42 +1631,42 @@ define void @v_shuffle_v2i32_v3i32__2_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx2 v6, v[3:4], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], 
s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__2_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1939,43 +1916,42 @@ define void @v_shuffle_v2i32_v3i32__2_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v3i32__2_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2175,7 +2151,7 @@ define void @s_shuffle_v2i32_v3i32__2_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 
+; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2187,7 +2163,7 @@ define void @s_shuffle_v2i32_v3i32__2_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2199,7 +2175,7 @@ define void @s_shuffle_v2i32_v3i32__2_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2274,7 +2250,7 @@ define void @s_shuffle_v2i32_v3i32__5_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2286,7 +2262,7 @@ define void @s_shuffle_v2i32_v3i32__5_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2298,7 +2274,7 @@ define void @s_shuffle_v2i32_v3i32__5_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2317,11 +2293,11 @@ define void @s_shuffle_v2i32_v3i32__5_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s8 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2333,11 +2309,11 @@ define void @s_shuffle_v2i32_v3i32__5_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s8 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2347,13 +2323,13 @@ define void @s_shuffle_v2i32_v3i32__5_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2372,10 +2348,11 @@ define void @s_shuffle_v2i32_v3i32__5_1() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2387,10 +2364,11 @@ define void 
@s_shuffle_v2i32_v3i32__5_1() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2400,12 +2378,13 @@ define void @s_shuffle_v2i32_v3i32__5_1() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:10] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2424,11 +2403,11 @@ define void @s_shuffle_v2i32_v3i32__5_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2440,11 +2419,11 @@ define void @s_shuffle_v2i32_v3i32__5_2() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2454,13 +2433,13 @@ define void @s_shuffle_v2i32_v3i32__5_2() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s6 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2479,8 +2458,8 @@ define void @s_shuffle_v2i32_v3i32__5_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2492,8 +2471,8 @@ define void @s_shuffle_v2i32_v3i32__5_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2505,8 +2484,8 @@ define void @s_shuffle_v2i32_v3i32__5_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2519,17 +2498,44 @@ define void @s_shuffle_v2i32_v3i32__5_3() { } 
define void @s_shuffle_v2i32_v3i32__5_4() { -; GFX9-LABEL: s_shuffle_v2i32_v3i32__5_4: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:10] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:9] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v2i32_v3i32__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v2i32_v3i32__5_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s1 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=s"() %vec1 = call <3 x i32> asm "; def $0", "=s"() %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> @@ -2544,8 +2550,8 @@ define void @s_shuffle_v2i32_v3i32__5_5() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2557,8 +2563,8 @@ define void @s_shuffle_v2i32_v3i32__5_5() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2570,8 +2576,8 @@ define void @s_shuffle_v2i32_v3i32__5_5() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b32 s3, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2695,8 +2701,8 @@ define void @s_shuffle_v2i32_v3i32__2_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2708,8 +2714,8 @@ define void @s_shuffle_v2i32_v3i32__2_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; 
GFX90A-NEXT: ;;#ASMEND @@ -2721,8 +2727,8 @@ define void @s_shuffle_v2i32_v3i32__2_0() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2929,17 +2935,44 @@ define void @s_shuffle_v2i32_v3i32__1_1() { } define void @s_shuffle_v2i32_v3i32__2_1() { -; GFX9-LABEL: s_shuffle_v2i32_v3i32__2_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:10] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:9] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v2i32_v3i32__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v2i32_v3i32__2_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s1 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=s"() %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x i32> %shuf) @@ -3150,8 +3183,8 @@ define void @s_shuffle_v2i32_v3i32__2_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3163,8 +3196,8 @@ define void @s_shuffle_v2i32_v3i32__2_2() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3176,8 +3209,8 @@ define void @s_shuffle_v2i32_v3i32__2_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b32 s3, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3388,7 +3421,7 @@ define void @s_shuffle_v2i32_v3i32__2_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: 
;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3400,7 +3433,7 @@ define void @s_shuffle_v2i32_v3i32__2_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3412,7 +3445,7 @@ define void @s_shuffle_v2i32_v3i32__2_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3635,10 +3668,11 @@ define void @s_shuffle_v2i32_v3i32__2_4() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3650,10 +3684,11 @@ define void @s_shuffle_v2i32_v3i32__2_4() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3663,12 +3698,13 @@ define void @s_shuffle_v2i32_v3i32__2_4() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:10] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3897,11 +3933,11 @@ define void @s_shuffle_v2i32_v3i32__2_5() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3913,11 +3949,11 @@ define void @s_shuffle_v2i32_v3i32__2_5() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3927,13 +3963,13 @@ define void @s_shuffle_v2i32_v3i32__2_5() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s6 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s6 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND diff --git 
a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll index 299dfba482953..02fb06ef54d42 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v2p3_v2p3__1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -111,12 +110,11 @@ define void @v_shuffle_v2p3_v2p3__3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -154,16 +152,14 @@ define void @v_shuffle_v2p3_v2p3__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -208,15 +204,14 @@ define void @v_shuffle_v2p3_v2p3__3_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -304,12 +299,12 @@ define void @v_shuffle_v2p3_v2p3__3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -347,12 
+342,11 @@ define void @v_shuffle_v2p3_v2p3__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -473,12 +467,11 @@ define void @v_shuffle_v2p3_v2p3__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -593,12 +586,12 @@ define void @v_shuffle_v2p3_v2p3__1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -724,12 +717,11 @@ define void @v_shuffle_v2p3_v2p3__1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__1_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -874,11 +866,11 @@ define void @v_shuffle_v2p3_v2p3__1_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll index 13e3d94c35446..d0f00f8363aed 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll @@ -99,36 +99,33 @@ define void @v_shuffle_v2p3_v3p3__2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__2_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; 
GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__2_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -194,36 +191,33 @@ define void @v_shuffle_v2p3_v3p3__5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__5_u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -237,49 +231,45 @@ define void @v_shuffle_v2p3_v3p3__5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; 
GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__5_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -293,46 +283,43 @@ define void @v_shuffle_v2p3_v3p3__5_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__5_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) 
; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -346,15 +333,14 @@ define void @v_shuffle_v2p3_v3p3__5_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx2 v6, v[1:2], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -362,15 +348,14 @@ define void @v_shuffle_v2p3_v3p3__5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -378,15 +363,14 @@ define void @v_shuffle_v2p3_v3p3__5_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -400,13 +384,12 @@ define void @v_shuffle_v2p3_v3p3__5_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -445,36 +428,37 @@ define void @v_shuffle_v2p3_v3p3__5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: 
global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__5_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -488,38 +472,37 @@ define void @v_shuffle_v2p3_v3p3__5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__5_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -533,12 +516,11 @@ define void @v_shuffle_v2p3_v3p3__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v3, 
v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -659,13 +641,12 @@ define void @v_shuffle_v2p3_v3p3__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -703,12 +684,11 @@ define void @v_shuffle_v2p3_v3p3__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v0, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -918,36 +898,37 @@ define void @v_shuffle_v2p3_v3p3__2_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__2_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__2_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1178,38 +1159,37 @@ define void @v_shuffle_v2p3_v3p3__2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: 
v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1408,36 +1388,33 @@ define void @v_shuffle_v2p3_v3p3__2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v3p3__2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v3, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__2_3: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v3, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1654,42 +1631,42 @@ define void @v_shuffle_v2p3_v3p3__2_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx2 v6, v[3:4], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: 
s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__2_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1939,43 +1916,42 @@ define void @v_shuffle_v2p3_v3p3__2_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v3p3__2_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v6 +; GFX942-NEXT: global_store_dwordx2 v7, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2175,7 +2151,7 @@ define void @s_shuffle_v2p3_v3p3__2_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; 
GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2187,7 +2163,7 @@ define void @s_shuffle_v2p3_v3p3__2_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2199,7 +2175,7 @@ define void @s_shuffle_v2p3_v3p3__2_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2274,7 +2250,7 @@ define void @s_shuffle_v2p3_v3p3__5_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2286,7 +2262,7 @@ define void @s_shuffle_v2p3_v3p3__5_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2298,7 +2274,7 @@ define void @s_shuffle_v2p3_v3p3__5_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2317,11 +2293,11 @@ define void @s_shuffle_v2p3_v3p3__5_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s8 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2333,11 +2309,11 @@ define void @s_shuffle_v2p3_v3p3__5_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s8 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2347,13 +2323,13 @@ define void @s_shuffle_v2p3_v3p3__5_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s4 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2372,10 +2348,11 @@ define void @s_shuffle_v2p3_v3p3__5_1() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2387,10 +2364,11 @@ define void 
@s_shuffle_v2p3_v3p3__5_1() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2400,12 +2378,13 @@ define void @s_shuffle_v2p3_v3p3__5_1() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:10] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2424,11 +2403,11 @@ define void @s_shuffle_v2p3_v3p3__5_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2440,11 +2419,11 @@ define void @s_shuffle_v2p3_v3p3__5_2() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2454,13 +2433,13 @@ define void @s_shuffle_v2p3_v3p3__5_2() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s6 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s6 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2479,8 +2458,8 @@ define void @s_shuffle_v2p3_v3p3__5_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2492,8 +2471,8 @@ define void @s_shuffle_v2p3_v3p3__5_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2505,8 +2484,8 @@ define void @s_shuffle_v2p3_v3p3__5_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2519,17 +2498,44 @@ define void @s_shuffle_v2p3_v3p3__5_3() { } define void 
@s_shuffle_v2p3_v3p3__5_4() { -; GFX9-LABEL: s_shuffle_v2p3_v3p3__5_4: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:10] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:9] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v2p3_v3p3__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v2p3_v3p3__5_4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s1 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> @@ -2544,8 +2550,8 @@ define void @s_shuffle_v2p3_v3p3__5_5() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2557,8 +2563,8 @@ define void @s_shuffle_v2p3_v3p3__5_5() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2570,8 +2576,8 @@ define void @s_shuffle_v2p3_v3p3__5_5() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b32 s3, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2695,8 +2701,8 @@ define void @s_shuffle_v2p3_v3p3__2_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -2708,8 +2714,8 @@ define void @s_shuffle_v2p3_v3p3__2_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; 
use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -2721,8 +2727,8 @@ define void @s_shuffle_v2p3_v3p3__2_0() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s0 +; GFX942-NEXT: s_mov_b32 s3, s0 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -2929,17 +2935,44 @@ define void @s_shuffle_v2p3_v3p3__1_1() { } define void @s_shuffle_v2p3_v3p3__2_1() { -; GFX9-LABEL: s_shuffle_v2p3_v3p3__2_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:10] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:9] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v2p3_v3p3__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: s_shuffle_v2p3_v3p3__2_1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s1 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; use s[8:9] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:9]}"(<2 x ptr addrspace(3)> %shuf) @@ -3150,8 +3183,8 @@ define void @s_shuffle_v2p3_v3p3__2_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3163,8 +3196,8 @@ define void @s_shuffle_v2p3_v3p3__2_2() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3176,8 +3209,8 @@ define void @s_shuffle_v2p3_v3p3__2_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s2 +; GFX942-NEXT: s_mov_b32 s3, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3388,7 +3421,7 @@ define void @s_shuffle_v2p3_v3p3__2_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: 
s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3400,7 +3433,7 @@ define void @s_shuffle_v2p3_v3p3__2_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3412,7 +3445,7 @@ define void @s_shuffle_v2p3_v3p3__2_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3635,10 +3668,11 @@ define void @s_shuffle_v2p3_v3p3__2_4() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s9 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3650,10 +3684,11 @@ define void @s_shuffle_v2p3_v3p3__2_4() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s9 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3663,12 +3698,13 @@ define void @s_shuffle_v2p3_v3p3__2_4() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:10] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s5 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND @@ -3897,11 +3933,11 @@ define void @s_shuffle_v2p3_v3p3__2_5() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:10] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s7, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s6 -; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -3913,11 +3949,11 @@ define void @s_shuffle_v2p3_v3p3__2_5() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:10] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s7, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -3927,13 +3963,13 @@ define void @s_shuffle_v2p3_v3p3__2_5() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:2] +; GFX942-NEXT: ; def s[4:6] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s3, s6 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:6] +; GFX942-NEXT: ; def s[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s2 -; GFX942-NEXT: s_mov_b32 s9, s6 +; GFX942-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:9] ; GFX942-NEXT: ;;#ASMEND diff --git 
a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll index 430f64164d24f..35cf10f1135c9 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v3f32_v2f32__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -111,12 +110,11 @@ define void @v_shuffle_v3f32_v2f32__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -154,15 +152,14 @@ define void @v_shuffle_v3f32_v2f32__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -208,15 +205,14 @@ define void @v_shuffle_v3f32_v2f32__3_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -263,10 +259,10 @@ define void @v_shuffle_v3f32_v2f32__3_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -307,12 +303,12 @@ define void @v_shuffle_v3f32_v2f32__3_3_u(ptr addrspace(1) inreg 
%ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -350,15 +346,15 @@ define void @v_shuffle_v3f32_v2f32__3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -403,14 +399,14 @@ define void @v_shuffle_v3f32_v2f32__3_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -458,11 +454,11 @@ define void @v_shuffle_v3f32_v2f32__3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -503,13 +499,13 @@ define void @v_shuffle_v3f32_v2f32__3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -561,26 +557,25 @@ define void @v_shuffle_v3f32_v2f32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; 
GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -638,13 +633,13 @@ define void @v_shuffle_v3f32_v2f32__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__1_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -696,26 +691,25 @@ define void @v_shuffle_v3f32_v2f32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -729,15 +723,14 @@ define void @v_shuffle_v3f32_v2f32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; 
GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -784,15 +777,14 @@ define void @v_shuffle_v3f32_v2f32__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -836,16 +828,15 @@ define void @v_shuffle_v3f32_v2f32__3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -891,16 +882,15 @@ define void @v_shuffle_v3f32_v2f32__3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1033,13 +1023,13 @@ define void @v_shuffle_v3f32_v2f32__1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1121,16 +1111,15 @@ define void @v_shuffle_v3f32_v2f32__3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1176,15 +1165,14 @@ define void @v_shuffle_v3f32_v2f32__3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1230,15 +1218,14 @@ define void @v_shuffle_v3f32_v2f32__3_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1289,13 +1276,12 @@ define void @v_shuffle_v3f32_v2f32__3_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1393,12 +1379,11 @@ define void @v_shuffle_v3f32_v2f32__1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__1_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1446,13 +1431,13 @@ define void @v_shuffle_v3f32_v2f32__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1493,13 +1478,12 @@ define void @v_shuffle_v3f32_v2f32__3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_u_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1539,30 +1523,29 @@ define void @v_shuffle_v3f32_v2f32__3_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_0_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1570,16 +1553,16 @@ define void @v_shuffle_v3f32_v2f32__3_0_2(ptr addrspace(1) inreg %ptr) { ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_0_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 ; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -1595,14 +1578,14 @@ define void @v_shuffle_v3f32_v2f32__3_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1746,15 +1729,14 @@ define void @v_shuffle_v3f32_v2f32__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1844,12 +1826,12 @@ define void @v_shuffle_v3f32_v2f32__3_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_u_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1890,15 +1872,14 @@ define void @v_shuffle_v3f32_v2f32__3_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1947,15 +1928,14 @@ define void @v_shuffle_v3f32_v2f32__3_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2001,12 +1981,13 @@ define void @v_shuffle_v3f32_v2f32__3_2_3(ptr 
addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2370,7 +2351,29 @@ define void @s_shuffle_v3f32_v2f32__3_3_u() { } define void @s_shuffle_v3f32_v2f32__3_3_0() { -; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_3_0: +; GFX9-LABEL: s_shuffle_v3f32_v2f32__3_3_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__3_3_1() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -2380,13 +2383,13 @@ define void @s_shuffle_v3f32_v2f32__3_3_0() { ; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s9 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_3_0: +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_3_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -2396,13 +2399,13 @@ define void @s_shuffle_v3f32_v2f32__3_3_0() { ; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s9 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_3_0: +; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_3_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -2412,115 +2415,31 @@ define void @s_shuffle_v3f32_v2f32__3_3_0() { ; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s9 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s10, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=s"() %vec1 = call <2 x float> asm "; def $0", "=s"() - %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x float> %shuf) - ret void -} - -define void @s_shuffle_v3f32_v2f32__3_3_1() { -; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_3_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def 
s[8:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s9 -; GFX900-NEXT: s_mov_b32 s10, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_3_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s9 -; GFX90A-NEXT: s_mov_b32 s10, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_3_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s9 -; GFX942-NEXT: s_mov_b32 s10, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <2 x float> asm "; def $0", "=s"() - %vec1 = call <2 x float> asm "; def $0", "=s"() - %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x float> %shuf) ret void } define void @s_shuffle_v3f32_v2f32__3_3_2() { -; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_3_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s9, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_3_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s9, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_3_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s9, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3f32_v2f32__3_3_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=s"() %vec1 = call <2 x float> asm "; def $0", "=s"() %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> @@ -2549,44 +2468,17 @@ define void @s_shuffle_v3f32_v2f32__3_3_3() { } define void 
@s_shuffle_v3f32_v2f32__u_0_0() { -; GFX900-LABEL: s_shuffle_v3f32_v2f32__u_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3f32_v2f32__u_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3f32_v2f32__u_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3f32_v2f32__u_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=s"() %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> <i32 poison, i32 0, i32 0> call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x float> %shuf) @@ -2613,47 +2505,18 @@ define void @s_shuffle_v3f32_v2f32__0_0_0() { } define void @s_shuffle_v3f32_v2f32__1_0_0() { -; GFX900-LABEL: s_shuffle_v3f32_v2f32__1_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3f32_v2f32__1_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3f32_v2f32__1_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3f32_v2f32__1_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float>
asm "; def $0", "=s"() %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> <i32 1, i32 0, i32 0> call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x float> %shuf) @@ -2661,44 +2524,17 @@ define void @s_shuffle_v3f32_v2f32__1_0_0() { } define void @s_shuffle_v3f32_v2f32__2_0_0() { -; GFX900-LABEL: s_shuffle_v3f32_v2f32__2_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3f32_v2f32__2_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3f32_v2f32__2_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3f32_v2f32__2_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=s"() %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> <i32 2, i32 0, i32 0> call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x float> %shuf) @@ -2710,14 +2546,13 @@ define void @s_shuffle_v3f32_v2f32__3_0_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2727,14 +2562,13 @@ define void @s_shuffle_v3f32_v2f32__3_0_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2744,14 +2578,13 @@ define void @s_shuffle_v3f32_v2f32__3_0_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ;
def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 +; GFX942-NEXT: s_mov_b32 s9, s10 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -2768,13 +2601,12 @@ define void @s_shuffle_v3f32_v2f32__3_u_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2784,13 +2616,12 @@ define void @s_shuffle_v3f32_v2f32__3_u_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s8, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2800,13 +2631,12 @@ define void @s_shuffle_v3f32_v2f32__3_u_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -2823,14 +2653,13 @@ define void @s_shuffle_v3f32_v2f32__3_1_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s9, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2840,14 +2669,13 @@ define void @s_shuffle_v3f32_v2f32__3_1_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s9, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2857,14 +2685,13 @@ define void @s_shuffle_v3f32_v2f32__3_1_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; 
GFX942-NEXT: s_mov_b32 s9, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 +; GFX942-NEXT: s_mov_b32 s9, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -2881,14 +2708,13 @@ define void @s_shuffle_v3f32_v2f32__3_2_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s9, s6 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2898,14 +2724,13 @@ define void @s_shuffle_v3f32_v2f32__3_2_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s9, s6 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2915,14 +2740,13 @@ define void @s_shuffle_v3f32_v2f32__3_2_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s9, s2 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 +; GFX942-NEXT: s_mov_b32 s9, s0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -3344,47 +3168,18 @@ define void @s_shuffle_v3f32_v2f32__2_2_2() { } define void @s_shuffle_v3f32_v2f32__3_2_2() { -; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_2_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_2_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_2_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3f32_v2f32__3_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=s"() %vec1 = call <2 x float> asm "; def $0", "=s"() %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> <i32 3, i32 2, i32 2> @@ -3393,44 +3188,17 @@ define void @s_shuffle_v3f32_v2f32__3_2_2() { } define void @s_shuffle_v3f32_v2f32__3_u_2() { -; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_u_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_u_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_u_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3f32_v2f32__3_u_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=s"() %vec1 = call <2 x float> asm "; def $0", "=s"() %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> <i32 3, i32 poison, i32 2> @@ -3446,11 +3214,10 @@ define void @s_shuffle_v3f32_v2f32__3_0_2() { ; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s8, s11 ; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -3463,11 +3230,10 @@ define void @s_shuffle_v3f32_v2f32__3_0_2() { ; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s8, s11 ; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -3480,11 +3246,10 @@ define void @s_shuffle_v3f32_v2f32__3_0_2() { ; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 +; GFX942-NEXT: s_mov_b32 s8, s11 ; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s2 ; GFX942-NEXT: ;;#ASMSTART ;
GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -3497,53 +3262,20 @@ define void @s_shuffle_v3f32_v2f32__3_0_2() { } define void @s_shuffle_v3f32_v2f32__3_1_2() { -; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_1_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_1_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_1_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3f32_v2f32__3_1_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=s"() %vec1 = call <2 x float> asm "; def $0", "=s"() %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> <i32 3, i32 1, i32 2> diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll index ef670e963bdb6..befc1126d6fa4 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v3f32_v3f32__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -153,12 +152,11 @@ define void @v_shuffle_v3f32_v3f32__4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__4_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT:
global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -240,15 +238,14 @@ define void @v_shuffle_v3f32_v3f32__5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -355,9 +352,8 @@ define void @v_shuffle_v3f32_v3f32__5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -403,13 +399,12 @@ define void @v_shuffle_v3f32_v3f32__5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -495,9 +490,8 @@ define void @v_shuffle_v3f32_v3f32__5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -540,14 +534,12 @@ define void @v_shuffle_v3f32_v3f32__5_5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -597,16 +589,14 @@ define void @v_shuffle_v3f32_v3f32__5_5_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; 
GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -656,15 +646,14 @@ define void @v_shuffle_v3f32_v3f32__5_5_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -760,14 +749,13 @@ define void @v_shuffle_v3f32_v3f32__5_5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -813,10 +801,10 @@ define void @v_shuffle_v3f32_v3f32__5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -857,12 +845,11 @@ define void @v_shuffle_v3f32_v3f32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -994,13 +981,12 @@ define void @v_shuffle_v3f32_v3f32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; 
GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1041,12 +1027,11 @@ define void @v_shuffle_v3f32_v3f32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1088,14 +1073,12 @@ define void @v_shuffle_v3f32_v3f32__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1145,16 +1128,15 @@ define void @v_shuffle_v3f32_v3f32__5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1376,16 +1358,15 @@ define void @v_shuffle_v3f32_v3f32__5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1574,10 +1555,10 @@ define void @v_shuffle_v3f32_v3f32__1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: 
v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1704,15 +1685,14 @@ define void @v_shuffle_v3f32_v3f32__4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1816,15 +1796,14 @@ define void @v_shuffle_v3f32_v3f32__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1872,16 +1851,14 @@ define void @v_shuffle_v3f32_v3f32__5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1931,16 +1908,15 @@ define void @v_shuffle_v3f32_v3f32__5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1989,16 +1965,15 @@ define void @v_shuffle_v3f32_v3f32__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2100,12 +2075,12 @@ define void @v_shuffle_v3f32_v3f32__u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__u_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2184,13 +2159,12 @@ define void @v_shuffle_v3f32_v3f32__1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__1_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2232,10 +2206,10 @@ define void @v_shuffle_v3f32_v3f32__2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2274,12 +2248,12 @@ define void @v_shuffle_v3f32_v3f32__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__3_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2317,15 +2291,14 @@ define void @v_shuffle_v3f32_v3f32__4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND 
-; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2374,13 +2347,13 @@ define void @v_shuffle_v3f32_v3f32__5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2480,16 +2453,15 @@ define void @v_shuffle_v3f32_v3f32__5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2591,15 +2563,15 @@ define void @v_shuffle_v3f32_v3f32__5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2751,12 +2723,11 @@ define void @v_shuffle_v3f32_v3f32__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__1_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2895,13 +2866,12 @@ define void @v_shuffle_v3f32_v3f32__5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; 
GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2988,16 +2958,14 @@ define void @v_shuffle_v3f32_v3f32__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3103,10 +3071,8 @@ define void @v_shuffle_v3f32_v3f32__5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3306,9 +3272,8 @@ define void @v_shuffle_v3f32_v3f32__1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3456,10 +3421,10 @@ define void @v_shuffle_v3f32_v3f32__4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3545,13 +3510,12 @@ define void @v_shuffle_v3f32_v3f32__5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3592,15 +3556,15 @@ define void @v_shuffle_v3f32_v3f32__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3705,14 +3669,13 @@ define void @v_shuffle_v3f32_v3f32__5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3760,13 +3723,12 @@ define void @v_shuffle_v3f32_v3f32__5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3807,12 +3769,12 @@ define void @v_shuffle_v3f32_v3f32__u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__u_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3913,9 +3875,9 @@ define void @v_shuffle_v3f32_v3f32__1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3963,15 +3925,15 @@ define void @v_shuffle_v3f32_v3f32__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4060,13 +4022,12 @@ define void @v_shuffle_v3f32_v3f32__4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__4_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4150,15 +4111,15 @@ define void @v_shuffle_v3f32_v3f32__5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4262,15 +4223,15 @@ define void @v_shuffle_v3f32_v3f32__5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4316,14 +4277,13 @@ define void @v_shuffle_v3f32_v3f32__5_3_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_3_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll index ea4fac3b1d2b1..51d45922893b3 100644 --- 
a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v3i32_v2i32__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -111,12 +110,11 @@ define void @v_shuffle_v3i32_v2i32__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -154,15 +152,14 @@ define void @v_shuffle_v3i32_v2i32__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -208,15 +205,14 @@ define void @v_shuffle_v3i32_v2i32__3_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -263,10 +259,10 @@ define void @v_shuffle_v3i32_v2i32__3_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -307,12 +303,12 @@ define void @v_shuffle_v3i32_v2i32__3_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 
v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -350,15 +346,15 @@ define void @v_shuffle_v3i32_v2i32__3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -403,14 +399,14 @@ define void @v_shuffle_v3i32_v2i32__3_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -458,11 +454,11 @@ define void @v_shuffle_v3i32_v2i32__3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -503,13 +499,13 @@ define void @v_shuffle_v3i32_v2i32__3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -561,26 +557,25 @@ define void @v_shuffle_v3i32_v2i32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; 
GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -638,13 +633,13 @@ define void @v_shuffle_v3i32_v2i32__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__1_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -696,26 +691,25 @@ define void @v_shuffle_v3i32_v2i32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -729,15 +723,14 @@ define void @v_shuffle_v3i32_v2i32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -784,15 +777,14 @@ define void @v_shuffle_v3i32_v2i32__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -836,16 +828,15 @@ define void @v_shuffle_v3i32_v2i32__3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -891,16 +882,15 @@ define void @v_shuffle_v3i32_v2i32__3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1033,13 +1023,13 @@ define void @v_shuffle_v3i32_v2i32__1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1121,16 +1111,15 @@ define void @v_shuffle_v3i32_v2i32__3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1176,15 +1165,14 @@ define void @v_shuffle_v3i32_v2i32__3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1230,15 +1218,14 @@ define void @v_shuffle_v3i32_v2i32__3_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1289,13 +1276,12 @@ define void @v_shuffle_v3i32_v2i32__3_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1393,12 +1379,11 @@ define void @v_shuffle_v3i32_v2i32__1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__1_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1446,13 +1431,13 @@ define void @v_shuffle_v3i32_v2i32__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 
0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1493,13 +1478,12 @@ define void @v_shuffle_v3i32_v2i32__3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_u_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1539,30 +1523,29 @@ define void @v_shuffle_v3i32_v2i32__3_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_0_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1570,16 +1553,16 @@ define void @v_shuffle_v3i32_v2i32__3_0_2(ptr addrspace(1) inreg %ptr) { ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_0_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 ; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -1595,14 +1578,14 @@ define void @v_shuffle_v3i32_v2i32__3_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; 
GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1746,15 +1729,14 @@ define void @v_shuffle_v3i32_v2i32__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1844,12 +1826,12 @@ define void @v_shuffle_v3i32_v2i32__3_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_u_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1890,15 +1872,14 @@ define void @v_shuffle_v3i32_v2i32__3_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1947,15 +1928,14 @@ define void @v_shuffle_v3i32_v2i32__3_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2001,12 +1981,13 @@ define void @v_shuffle_v3i32_v2i32__3_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: 
v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2370,7 +2351,29 @@ define void @s_shuffle_v3i32_v2i32__3_3_u() { } define void @s_shuffle_v3i32_v2i32__3_3_0() { -; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_3_0: +; GFX9-LABEL: s_shuffle_v3i32_v2i32__3_3_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__3_3_1() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -2380,13 +2383,13 @@ define void @s_shuffle_v3i32_v2i32__3_3_0() { ; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s9 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_3_0: +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_3_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -2396,13 +2399,13 @@ define void @s_shuffle_v3i32_v2i32__3_3_0() { ; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s9 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_3_0: +; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_3_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -2412,115 +2415,31 @@ define void @s_shuffle_v3i32_v2i32__3_3_0() { ; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s9 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s10, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=s"() %vec1 = call <2 x i32> asm "; def $0", "=s"() - %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x i32> %shuf) - ret void -} - -define void @s_shuffle_v3i32_v2i32__3_3_1() { -; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_3_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s9 -; GFX900-NEXT: s_mov_b32 s10, s5 -; 
GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_3_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s9 -; GFX90A-NEXT: s_mov_b32 s10, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_3_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s9 -; GFX942-NEXT: s_mov_b32 s10, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <2 x i32> asm "; def $0", "=s"() - %vec1 = call <2 x i32> asm "; def $0", "=s"() - %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x i32> %shuf) ret void } define void @s_shuffle_v3i32_v2i32__3_3_2() { -; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_3_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s9, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_3_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s9, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_3_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s9, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i32_v2i32__3_3_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=s"() %vec1 = call <2 x i32> asm "; def $0", "=s"() %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> @@ -2549,44 +2468,17 @@ define void @s_shuffle_v3i32_v2i32__3_3_3() { } define void @s_shuffle_v3i32_v2i32__u_0_0() { -; GFX900-LABEL: s_shuffle_v3i32_v2i32__u_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; 
GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i32_v2i32__u_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i32_v2i32__u_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i32_v2i32__u_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=s"() %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x i32> %shuf) @@ -2613,47 +2505,18 @@ define void @s_shuffle_v3i32_v2i32__0_0_0() { } define void @s_shuffle_v3i32_v2i32__1_0_0() { -; GFX900-LABEL: s_shuffle_v3i32_v2i32__1_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i32_v2i32__1_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i32_v2i32__1_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i32_v2i32__1_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=s"() %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x i32> %shuf) @@ -2661,44 +2524,17 @@ define void @s_shuffle_v3i32_v2i32__1_0_0() { } 
define void @s_shuffle_v3i32_v2i32__2_0_0() { -; GFX900-LABEL: s_shuffle_v3i32_v2i32__2_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i32_v2i32__2_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i32_v2i32__2_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i32_v2i32__2_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=s"() %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x i32> %shuf) @@ -2710,14 +2546,13 @@ define void @s_shuffle_v3i32_v2i32__3_0_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2727,14 +2562,13 @@ define void @s_shuffle_v3i32_v2i32__3_0_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2744,14 +2578,13 @@ define void @s_shuffle_v3i32_v2i32__3_0_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 +; GFX942-NEXT: s_mov_b32 s9, s10 ; GFX942-NEXT: ;;#ASMSTART ; 
GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -2768,13 +2601,12 @@ define void @s_shuffle_v3i32_v2i32__3_u_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2784,13 +2616,12 @@ define void @s_shuffle_v3i32_v2i32__3_u_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s8, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2800,13 +2631,12 @@ define void @s_shuffle_v3i32_v2i32__3_u_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -2823,14 +2653,13 @@ define void @s_shuffle_v3i32_v2i32__3_1_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s9, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2840,14 +2669,13 @@ define void @s_shuffle_v3i32_v2i32__3_1_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s9, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2857,14 +2685,13 @@ define void @s_shuffle_v3i32_v2i32__3_1_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s9, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 +; GFX942-NEXT: s_mov_b32 s9, s11 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -2881,14 
+2708,13 @@ define void @s_shuffle_v3i32_v2i32__3_2_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 -; GFX900-NEXT: s_mov_b32 s9, s6 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -2898,14 +2724,13 @@ define void @s_shuffle_v3i32_v2i32__3_2_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 -; GFX90A-NEXT: s_mov_b32 s9, s6 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -2915,14 +2740,13 @@ define void @s_shuffle_v3i32_v2i32__3_2_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 -; GFX942-NEXT: s_mov_b32 s9, s2 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s8, s1 +; GFX942-NEXT: s_mov_b32 s9, s0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -3344,47 +3168,18 @@ define void @s_shuffle_v3i32_v2i32__2_2_2() { } define void @s_shuffle_v3i32_v2i32__3_2_2() { -; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_2_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_2_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_2_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i32_v2i32__3_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; 
GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=s"() %vec1 = call <2 x i32> asm "; def $0", "=s"() %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> @@ -3393,44 +3188,17 @@ define void @s_shuffle_v3i32_v2i32__3_2_2() { } define void @s_shuffle_v3i32_v2i32__3_u_2() { -; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_u_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_u_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_u_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i32_v2i32__3_u_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=s"() %vec1 = call <2 x i32> asm "; def $0", "=s"() %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> @@ -3446,11 +3214,10 @@ define void @s_shuffle_v3i32_v2i32__3_0_2() { ; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ; def s[10:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s8, s11 ; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND @@ -3463,11 +3230,10 @@ define void @s_shuffle_v3i32_v2i32__3_0_2() { ; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ; def s[10:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s8, s11 ; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND @@ -3480,11 +3246,10 @@ define void @s_shuffle_v3i32_v2i32__3_0_2() { ; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[2:3] +; GFX942-NEXT: ; def s[10:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s3 +; GFX942-NEXT: s_mov_b32 s8, s11 ; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND @@ -3497,53 +3262,20 @@ define void @s_shuffle_v3i32_v2i32__3_0_2() { } define void @s_shuffle_v3i32_v2i32__3_1_2() { -; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_1_2: -; GFX900: ; %bb.0: -; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_1_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_1_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3i32_v2i32__3_1_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=s"() %vec1 = call <2 x i32> asm "; def $0", "=s"() %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll index 7061c13b28d03..89e6a2918a68c 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v3i32_v3i32__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -153,12 +152,11 @@ define void @v_shuffle_v3i32_v3i32__4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__4_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -240,15 +238,14 @@ define void @v_shuffle_v3i32_v3i32__5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -355,9 +352,8 @@ define void @v_shuffle_v3i32_v3i32__5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -403,13 +399,12 @@ define void @v_shuffle_v3i32_v3i32__5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -495,9 +490,8 @@ define void @v_shuffle_v3i32_v3i32__5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -540,14 +534,12 @@ define void @v_shuffle_v3i32_v3i32__5_5_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -597,16 +589,14 @@ define void @v_shuffle_v3i32_v3i32__5_5_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v1, v2 
+; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -656,15 +646,14 @@ define void @v_shuffle_v3i32_v3i32__5_5_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -760,14 +749,13 @@ define void @v_shuffle_v3i32_v3i32__5_5_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_5_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -813,10 +801,10 @@ define void @v_shuffle_v3i32_v3i32__5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -857,12 +845,11 @@ define void @v_shuffle_v3i32_v3i32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -994,13 +981,12 @@ define void @v_shuffle_v3i32_v3i32__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1041,12 
+1027,11 @@ define void @v_shuffle_v3i32_v3i32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1088,14 +1073,12 @@ define void @v_shuffle_v3i32_v3i32__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1145,16 +1128,15 @@ define void @v_shuffle_v3i32_v3i32__5_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1376,16 +1358,15 @@ define void @v_shuffle_v3i32_v3i32__5_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1574,10 +1555,10 @@ define void @v_shuffle_v3i32_v3i32__1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; 
@@ -1704,15 +1685,14 @@ define void @v_shuffle_v3i32_v3i32__4_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1816,15 +1796,14 @@ define void @v_shuffle_v3i32_v3i32__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1872,16 +1851,14 @@ define void @v_shuffle_v3i32_v3i32__5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1931,16 +1908,15 @@ define void @v_shuffle_v3i32_v3i32__5_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1989,16 +1965,15 @@ define void @v_shuffle_v3i32_v3i32__5_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; 
GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2100,12 +2075,12 @@ define void @v_shuffle_v3i32_v3i32__u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__u_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2184,13 +2159,12 @@ define void @v_shuffle_v3i32_v3i32__1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__1_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2232,10 +2206,10 @@ define void @v_shuffle_v3i32_v3i32__2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2274,12 +2248,12 @@ define void @v_shuffle_v3i32_v3i32__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__3_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2317,15 +2291,14 @@ define void @v_shuffle_v3i32_v3i32__4_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: 
s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2374,13 +2347,13 @@ define void @v_shuffle_v3i32_v3i32__5_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2480,16 +2453,15 @@ define void @v_shuffle_v3i32_v3i32__5_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2591,15 +2563,15 @@ define void @v_shuffle_v3i32_v3i32__5_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2751,12 +2723,11 @@ define void @v_shuffle_v3i32_v3i32__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__1_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2895,13 +2866,12 @@ define void @v_shuffle_v3i32_v3i32__5_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, 
v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2988,16 +2958,14 @@ define void @v_shuffle_v3i32_v3i32__5_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3103,10 +3071,8 @@ define void @v_shuffle_v3i32_v3i32__5_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3306,9 +3272,8 @@ define void @v_shuffle_v3i32_v3i32__1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v3 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3456,10 +3421,10 @@ define void @v_shuffle_v3i32_v3i32__4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3545,13 +3510,12 @@ define void @v_shuffle_v3i32_v3i32__5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3592,15 +3556,15 @@ define void @v_shuffle_v3i32_v3i32__5_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; 
GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3705,14 +3669,13 @@ define void @v_shuffle_v3i32_v3i32__5_2_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3760,13 +3723,12 @@ define void @v_shuffle_v3i32_v3i32__5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3807,12 +3769,12 @@ define void @v_shuffle_v3i32_v3i32__u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__u_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3913,9 +3875,9 @@ define void @v_shuffle_v3i32_v3i32__1_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3963,15 +3925,15 @@ define void @v_shuffle_v3i32_v3i32__2_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, v5 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: 
s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4060,13 +4022,12 @@ define void @v_shuffle_v3i32_v3i32__4_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__4_5_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4150,15 +4111,15 @@ define void @v_shuffle_v3i32_v3i32__5_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4262,15 +4223,15 @@ define void @v_shuffle_v3i32_v3i32__5_2_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4316,14 +4277,13 @@ define void @v_shuffle_v3i32_v3i32__5_3_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_3_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll index bd0100a4ffdb5..25e087bd922ac 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll @@ -58,12 +58,11 @@ define void @v_shuffle_v3p3_v2p3__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__1_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -111,12 +110,11 @@ define void @v_shuffle_v3p3_v2p3__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -154,15 +152,14 @@ define void @v_shuffle_v3p3_v2p3__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_0_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -208,15 +205,14 @@ define void @v_shuffle_v3p3_v2p3__3_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_1_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -263,10 +259,10 @@ define void @v_shuffle_v3p3_v2p3__3_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -307,12 +303,12 @@ define void @v_shuffle_v3p3_v2p3__3_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; 
GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -350,15 +346,15 @@ define void @v_shuffle_v3p3_v2p3__3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_3_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -403,14 +399,14 @@ define void @v_shuffle_v3p3_v2p3__3_3_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -458,11 +454,11 @@ define void @v_shuffle_v3p3_v2p3__3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -503,13 +499,13 @@ define void @v_shuffle_v3p3_v2p3__3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -561,26 +557,25 @@ define void @v_shuffle_v3p3_v2p3__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__u_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__u_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -638,13 +633,13 @@ define void @v_shuffle_v3p3_v2p3__1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__1_0_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -696,26 +691,25 @@ define void @v_shuffle_v3p3_v2p3__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__2_0_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__2_0_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v2 +; GFX942-NEXT: global_store_dwordx3 v0, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -729,15 +723,14 @@ define void @v_shuffle_v3p3_v2p3__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -784,15 +777,14 @@ define void @v_shuffle_v3p3_v2p3__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -836,16 +828,15 @@ define void @v_shuffle_v3p3_v2p3__3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_1_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -891,16 +882,15 @@ define void @v_shuffle_v3p3_v2p3__3_2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_2_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1033,13 +1023,13 @@ define void @v_shuffle_v3p3_v2p3__1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__1_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1121,16 +1111,15 @@ define void @v_shuffle_v3p3_v2p3__3_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_1_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: 
global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1176,15 +1165,14 @@ define void @v_shuffle_v3p3_v2p3__3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1230,15 +1218,14 @@ define void @v_shuffle_v3p3_v2p3__3_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_0_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1289,13 +1276,12 @@ define void @v_shuffle_v3p3_v2p3__3_2_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1393,12 +1379,11 @@ define void @v_shuffle_v3p3_v2p3__1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__1_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1446,13 +1431,13 @@ define void @v_shuffle_v3p3_v2p3__3_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1493,13 +1478,12 @@ define 
void @v_shuffle_v3p3_v2p3__3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_u_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1539,30 +1523,29 @@ define void @v_shuffle_v3p3_v2p3__3_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_0_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] -; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1570,16 +1553,16 @@ define void @v_shuffle_v3p3_v2p3__3_0_2(ptr addrspace(1) inreg %ptr) { ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_0_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] -; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: ;;#ASMSTART +; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 ; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -1595,14 +1578,14 @@ define void @v_shuffle_v3p3_v2p3__3_1_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: 
s_setpc_b64 s[30:31] ; @@ -1746,15 +1729,14 @@ define void @v_shuffle_v3p3_v2p3__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1844,12 +1826,12 @@ define void @v_shuffle_v3p3_v2p3__3_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_u_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v2, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1890,15 +1872,14 @@ define void @v_shuffle_v3p3_v2p3__3_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1947,15 +1928,14 @@ define void @v_shuffle_v3p3_v2p3__3_1_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2001,12 +1981,13 @@ define void @v_shuffle_v3p3_v2p3__3_2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_2_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; 
GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2370,7 +2351,29 @@ define void @s_shuffle_v3p3_v2p3__3_3_u() { } define void @s_shuffle_v3p3_v2p3__3_3_0() { -; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_3_0: +; GFX9-LABEL: s_shuffle_v3p3_v2p3__3_3_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:9] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__3_3_1() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_3_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART @@ -2380,13 +2383,13 @@ define void @s_shuffle_v3p3_v2p3__3_3_0() { ; GFX900-NEXT: ; def s[4:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s9 -; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_3_0: +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_3_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART @@ -2396,13 +2399,13 @@ define void @s_shuffle_v3p3_v2p3__3_3_0() { ; GFX90A-NEXT: ; def s[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s9 -; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_3_0: +; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_3_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART @@ -2412,115 +2415,31 @@ define void @s_shuffle_v3p3_v2p3__3_3_0() { ; GFX942-NEXT: ; def s[0:1] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s9 -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: s_mov_b32 s10, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:10] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() - %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> - call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x ptr addrspace(3)> %shuf) - ret void -} - -define void @s_shuffle_v3p3_v2p3__3_3_1() { -; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_3_1: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:9] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s9 -; GFX900-NEXT: s_mov_b32 s10, s5 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_3_1: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def 
s[8:9] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s9 -; GFX90A-NEXT: s_mov_b32 s10, s5 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_3_1: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:9] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s9 -; GFX942-NEXT: s_mov_b32 s10, s1 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() - %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() - %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x ptr addrspace(3)> %shuf) ret void } define void @s_shuffle_v3p3_v2p3__3_3_2() { -; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_3_2: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s9, s5 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_3_2: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s9, s5 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_3_2: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s9, s1 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p3_v2p3__3_3_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> @@ -2549,44 +2468,17 @@ define void @s_shuffle_v3p3_v2p3__3_3_3() { } define void @s_shuffle_v3p3_v2p3__u_0_0() { -; GFX900-LABEL: s_shuffle_v3p3_v2p3__u_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: 
;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p3_v2p3__u_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p3_v2p3__u_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p3_v2p3__u_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x ptr addrspace(3)> %shuf) @@ -2613,47 +2505,18 @@ define void @s_shuffle_v3p3_v2p3__0_0_0() { } define void @s_shuffle_v3p3_v2p3__1_0_0() { -; GFX900-LABEL: s_shuffle_v3p3_v2p3__1_0_0: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:5] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s5 -; GFX900-NEXT: s_mov_b32 s9, s4 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:10] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v3p3_v2p3__1_0_0: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:5] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s5 -; GFX90A-NEXT: s_mov_b32 s9, s4 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:10] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v3p3_v2p3__1_0_0: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:1] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s1 -; GFX942-NEXT: s_mov_b32 s9, s0 -; GFX942-NEXT: s_mov_b32 s10, s0 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:10] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v3p3_v2p3__1_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s8, s11 +; GFX9-NEXT: s_mov_b32 s9, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:10] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x ptr addrspace(3)> %shuf) @@ -2661,44 +2524,17 @@ define void @s_shuffle_v3p3_v2p3__1_0_0() { } define void @s_shuffle_v3p3_v2p3__2_0_0() { -; GFX900-LABEL: 
s_shuffle_v3p3_v2p3__2_0_0:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3p3_v2p3__2_0_0:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3p3_v2p3__2_0_0:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3p3_v2p3__2_0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s9, s10
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
 %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
 %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> <i32 2, i32 0, i32 0>
 call void asm sideeffect "; use $0", "{s[8:10]}"(<3 x ptr addrspace(3)> %shuf)
@@ -2710,14 +2546,13 @@ define void @s_shuffle_v3p3_v2p3__3_0_0() {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
+; GFX900-NEXT: ; def s[10:11]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[6:7]
+; GFX900-NEXT: ; def s[4:5]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s7
-; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s8, s5
+; GFX900-NEXT: s_mov_b32 s9, s10
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; use s[8:10]
 ; GFX900-NEXT: ;;#ASMEND
@@ -2727,14 +2562,13 @@ define void @s_shuffle_v3p3_v2p3__3_0_0() {
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
+; GFX90A-NEXT: ; def s[10:11]
 ; GFX90A-NEXT: ;;#ASMEND
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[6:7]
+; GFX90A-NEXT: ; def s[4:5]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s7
-; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s8, s5
+; GFX90A-NEXT: s_mov_b32 s9, s10
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; use s[8:10]
 ; GFX90A-NEXT: ;;#ASMEND
@@ -2744,14 +2578,13 @@ define void @s_shuffle_v3p3_v2p3__3_0_0() {
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
+; GFX942-NEXT: ; def s[10:11]
 ; GFX942-NEXT: ;;#ASMEND
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[2:3]
+; GFX942-NEXT: ; def s[0:1]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s3
-; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s0
+; GFX942-NEXT: s_mov_b32 s8, s1
+; GFX942-NEXT: s_mov_b32 s9, s10
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; use s[8:10]
 ; GFX942-NEXT: ;;#ASMEND
@@ -2768,13 +2601,12 @@ define void @s_shuffle_v3p3_v2p3__3_u_0() {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
+; GFX900-NEXT: ; def s[10:11]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[6:7]
+; GFX900-NEXT: ; def s[4:5]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s7
-; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s8, s5
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; use s[8:10]
 ; GFX900-NEXT: ;;#ASMEND
@@ -2784,13 +2616,12 @@ define void @s_shuffle_v3p3_v2p3__3_u_0() {
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
+; GFX90A-NEXT: ; def s[10:11]
 ; GFX90A-NEXT: ;;#ASMEND
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[6:7]
+; GFX90A-NEXT: ; def s[4:5]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s7
-; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s8, s5
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; use s[8:10]
 ; GFX90A-NEXT: ;;#ASMEND
@@ -2800,13 +2631,12 @@ define void @s_shuffle_v3p3_v2p3__3_u_0() {
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
+; GFX942-NEXT: ; def s[10:11]
 ; GFX942-NEXT: ;;#ASMEND
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[2:3]
+; GFX942-NEXT: ; def s[0:1]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s3
-; GFX942-NEXT: s_mov_b32 s10, s0
+; GFX942-NEXT: s_mov_b32 s8, s1
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; use s[8:10]
 ; GFX942-NEXT: ;;#ASMEND
@@ -2823,14 +2653,13 @@ define void @s_shuffle_v3p3_v2p3__3_1_0() {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
+; GFX900-NEXT: ; def s[10:11]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[6:7]
+; GFX900-NEXT: ; def s[4:5]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s7
-; GFX900-NEXT: s_mov_b32 s9, s5
-; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s8, s5
+; GFX900-NEXT: s_mov_b32 s9, s11
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; use s[8:10]
 ; GFX900-NEXT: ;;#ASMEND
@@ -2840,14 +2669,13 @@ define void @s_shuffle_v3p3_v2p3__3_1_0() {
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
+; GFX90A-NEXT: ; def s[10:11]
 ; GFX90A-NEXT: ;;#ASMEND
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[6:7]
+; GFX90A-NEXT: ; def s[4:5]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s7
-; GFX90A-NEXT: s_mov_b32 s9, s5
-; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s8, s5
+; GFX90A-NEXT: s_mov_b32 s9, s11
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; use s[8:10]
 ; GFX90A-NEXT: ;;#ASMEND
@@ -2857,14 +2685,13 @@ define void @s_shuffle_v3p3_v2p3__3_1_0() {
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
+; GFX942-NEXT: ; def s[10:11]
 ; GFX942-NEXT: ;;#ASMEND
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[2:3]
+; GFX942-NEXT: ; def s[0:1]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s3
-; GFX942-NEXT: s_mov_b32 s9, s1
-; GFX942-NEXT: s_mov_b32 s10, s0
+; GFX942-NEXT: s_mov_b32 s8, s1
+; GFX942-NEXT: s_mov_b32 s9, s11
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; use s[8:10]
 ; GFX942-NEXT: ;;#ASMEND
@@ -2881,14 +2708,13 @@ define void @s_shuffle_v3p3_v2p3__3_2_0() {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
+; GFX900-NEXT: ; def s[10:11]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[6:7]
+; GFX900-NEXT: ; def s[4:5]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s7
-; GFX900-NEXT: s_mov_b32 s9, s6
-; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s8, s5
+; GFX900-NEXT: s_mov_b32 s9, s4
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; use s[8:10]
 ; GFX900-NEXT: ;;#ASMEND
@@ -2898,14 +2724,13 @@ define void @s_shuffle_v3p3_v2p3__3_2_0() {
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
+; GFX90A-NEXT: ; def s[10:11]
 ; GFX90A-NEXT: ;;#ASMEND
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[6:7]
+; GFX90A-NEXT: ; def s[4:5]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s7
-; GFX90A-NEXT: s_mov_b32 s9, s6
-; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s8, s5
+; GFX90A-NEXT: s_mov_b32 s9, s4
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; use s[8:10]
 ; GFX90A-NEXT: ;;#ASMEND
@@ -2915,14 +2740,13 @@ define void @s_shuffle_v3p3_v2p3__3_2_0() {
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
+; GFX942-NEXT: ; def s[10:11]
 ; GFX942-NEXT: ;;#ASMEND
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[2:3]
+; GFX942-NEXT: ; def s[0:1]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s3
-; GFX942-NEXT: s_mov_b32 s9, s2
-; GFX942-NEXT: s_mov_b32 s10, s0
+; GFX942-NEXT: s_mov_b32 s8, s1
+; GFX942-NEXT: s_mov_b32 s9, s0
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; use s[8:10]
 ; GFX942-NEXT: ;;#ASMEND
@@ -3344,47 +3168,18 @@ define void @s_shuffle_v3p3_v2p3__2_2_2() {
 }
 
 define void @s_shuffle_v3p3_v2p3__3_2_2() {
-; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_2_2:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s5
-; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_2_2:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s5
-; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_2_2:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s1
-; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3p3_v2p3__3_2_2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s8, s11
+; GFX9-NEXT: s_mov_b32 s9, s10
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
 %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
 %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
 %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> <i32 3, i32 2, i32 2>
@@ -3393,44 +3188,17 @@ define void @s_shuffle_v3p3_v2p3__3_2_2() {
 }
 
 define void @s_shuffle_v3p3_v2p3__3_u_2() {
-; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_u_2:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s5
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_u_2:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s5
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_u_2:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s1
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3p3_v2p3__3_u_2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s8, s11
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
 %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
 %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
 %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> <i32 3, i32 poison, i32 2>
@@ -3446,11 +3214,10 @@ define void @s_shuffle_v3p3_v2p3__3_0_2() {
 ; GFX900-NEXT: ; def s[4:5]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[6:7]
+; GFX900-NEXT: ; def s[10:11]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s7
+; GFX900-NEXT: s_mov_b32 s8, s11
 ; GFX900-NEXT: s_mov_b32 s9, s4
-; GFX900-NEXT: s_mov_b32 s10, s6
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; use s[8:10]
 ; GFX900-NEXT: ;;#ASMEND
@@ -3463,11 +3230,10 @@ define void @s_shuffle_v3p3_v2p3__3_0_2() {
 ; GFX90A-NEXT: ; def s[4:5]
 ; GFX90A-NEXT: ;;#ASMEND
 ; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[6:7]
+; GFX90A-NEXT: ; def s[10:11]
 ; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s7
+; GFX90A-NEXT: s_mov_b32 s8, s11
 ; GFX90A-NEXT: s_mov_b32 s9, s4
-; GFX90A-NEXT: s_mov_b32 s10, s6
 ; GFX90A-NEXT: ;;#ASMSTART
 ; GFX90A-NEXT: ; use s[8:10]
 ; GFX90A-NEXT: ;;#ASMEND
@@ -3480,11 +3246,10 @@ define void @s_shuffle_v3p3_v2p3__3_0_2() {
 ; GFX942-NEXT: ; def s[0:1]
 ; GFX942-NEXT: ;;#ASMEND
 ; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[2:3]
+; GFX942-NEXT: ; def s[10:11]
 ; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s3
+; GFX942-NEXT: s_mov_b32 s8, s11
 ; GFX942-NEXT: s_mov_b32 s9, s0
-; GFX942-NEXT: s_mov_b32 s10, s2
 ; GFX942-NEXT: ;;#ASMSTART
 ; GFX942-NEXT: ; use s[8:10]
 ; GFX942-NEXT: ;;#ASMEND
@@ -3497,53 +3262,20 @@ define void @s_shuffle_v3p3_v2p3__3_0_2() {
 }
 
 define void @s_shuffle_v3p3_v2p3__3_1_2() {
-; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_1_2:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[8:9]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def s[4:5]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s8, s5
-; GFX900-NEXT: s_mov_b32 s10, s4
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[8:10]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_1_2:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[8:9]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def s[4:5]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_mov_b32 s8, s5
-; GFX90A-NEXT: s_mov_b32 s10, s4
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use s[8:10]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_1_2:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[8:9]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def s[0:1]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_mov_b32 s8, s1
-; GFX942-NEXT: s_mov_b32 s10, s0
-; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; use s[8:10]
-; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: s_shuffle_v3p3_v2p3__3_1_2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[8:9]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; def s[10:11]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_mov_b32 s8, s11
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use s[8:10]
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: s_setpc_b64 s[30:31]
 %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
 %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"()
 %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> <i32 3, i32 1, i32 2>
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll
index cecd2a0e4b015..62b9da9fedb95 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll
@@ -58,12 +58,11 @@ define void @v_shuffle_v3p3_v3p3__1_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v3p3__1_u_u:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -153,12 +152,11 @@ define void @v_shuffle_v3p3_v3p3__4_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v3p3__4_u_u:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -240,15 +238,14 @@ define void @v_shuffle_v3p3_v3p3__5_0_u(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:3]
+; GFX900-NEXT: ; def v[2:4]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -355,9 +352,8 @@ define void @v_shuffle_v3p3_v3p3__5_2_u(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[3:5]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v5
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
+; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -403,13 +399,12 @@ define void @v_shuffle_v3p3_v3p3__5_3_u(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_3_u:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -495,9 +490,8 @@ define void @v_shuffle_v3p3_v3p3__5_5_u(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
 ; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -540,14 +534,12 @@ define void @v_shuffle_v3p3_v3p3__5_5_0(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: v_mov_b32_e32 v1, v2
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:3]
+; GFX900-NEXT: ; def v[3:5]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: v_mov_b32_e32 v3, v0
-; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -597,16 +589,14 @@ define void @v_shuffle_v3p3_v3p3__5_5_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[2:4]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:4]
+; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: v_mov_b32_e32 v5, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, v4
-; GFX900-NEXT: v_mov_b32_e32 v3, v4
-; GFX900-NEXT: v_mov_b32_e32 v4, v1
-; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v1, v2
+; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -656,15 +646,14 @@ define void @v_shuffle_v3p3_v3p3__5_5_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:5]
+; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v5
-; GFX900-NEXT: v_mov_b32_e32 v1, v5
-; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: v_mov_b32_e32 v1, v2
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -760,14 +749,13 @@ define void @v_shuffle_v3p3_v3p3__5_5_4(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_5_4:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, v2
-; GFX900-NEXT: v_mov_b32_e32 v4, v2
-; GFX900-NEXT: v_mov_b32_e32 v5, v1
-; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: v_mov_b32_e32 v1, v3
+; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -813,10 +801,10 @@ define void @v_shuffle_v3p3_v3p3__5_5_5(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -857,12 +845,11 @@ define void @v_shuffle_v3p3_v3p3__u_0_0(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v0
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
+; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -994,13 +981,12 @@ define void @v_shuffle_v3p3_v3p3__2_0_0(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: v_mov_b32_e32 v3, v0
-; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1041,12 +1027,11 @@ define void @v_shuffle_v3p3_v3p3__3_0_0(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v0
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
+; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1088,14 +1073,12 @@ define void @v_shuffle_v3p3_v3p3__4_0_0(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:3]
+; GFX900-NEXT: ; def v[3:5]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: v_mov_b32_e32 v3, v0
-; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v3
+; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1145,16 +1128,15 @@ define void @v_shuffle_v3p3_v3p3__5_0_0(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:3]
+; GFX900-NEXT: ; def v[2:4]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: v_mov_b32_e32 v3, v0
-; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
+; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1376,16 +1358,15 @@ define void @v_shuffle_v3p3_v3p3__5_3_0(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v7, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:3]
+; GFX900-NEXT: ; def v[4:6]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v5, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: v_mov_b32_e32 v3, v1
-; GFX900-NEXT: v_mov_b32_e32 v4, v0
-; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: v_mov_b32_e32 v2, v4
+; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1574,10 +1555,10 @@ define void @v_shuffle_v3p3_v3p3__1_1_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1704,15 +1685,14 @@ define void @v_shuffle_v3p3_v3p3__4_1_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[2:4]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:4]
+; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: v_mov_b32_e32 v5, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v3
+; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1816,15 +1796,14 @@ define void @v_shuffle_v3p3_v3p3__5_u_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:4]
+; GFX900-NEXT: ; def v[3:5]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v5, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v4
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v5
+; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1872,16 +1851,14 @@ define void @v_shuffle_v3p3_v3p3__5_0_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:4]
+; GFX900-NEXT: ; def v[3:5]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v5, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, v4
-; GFX900-NEXT: v_mov_b32_e32 v3, v0
-; GFX900-NEXT: v_mov_b32_e32 v4, v1
-; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v5
+; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1931,16 +1908,15 @@ define void @v_shuffle_v3p3_v3p3__5_2_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:5]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: v_mov_b32_e32 v7, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[4:6]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: v_mov_b32_e32 v4, v2
-; GFX900-NEXT: v_mov_b32_e32 v5, v1
-; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v6
+; GFX900-NEXT: v_mov_b32_e32 v1, v3
+; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1989,16 +1965,15 @@ define void @v_shuffle_v3p3_v3p3__5_3_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[3:5]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:4]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: v_mov_b32_e32 v6, 0
-; GFX900-NEXT: v_mov_b32_e32 v3, v4
-; GFX900-NEXT: v_mov_b32_e32 v4, v2
-; GFX900-NEXT: v_mov_b32_e32 v5, v1
-; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: v_mov_b32_e32 v2, v4
+; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2100,12 +2075,12 @@ define void @v_shuffle_v3p3_v3p3__u_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v3p3__u_2_2:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2184,13 +2159,12 @@ define void @v_shuffle_v3p3_v3p3__1_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v3p3__1_2_2:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2232,10 +2206,10 @@ define void @v_shuffle_v3p3_v3p3__2_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2274,12 +2248,12 @@ define void @v_shuffle_v3p3_v3p3__3_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v3p3__3_2_2:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2317,15 +2291,14 @@ define void @v_shuffle_v3p3_v3p3__4_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:5]
+; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v4
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, v3
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2374,13 +2347,13 @@ define void @v_shuffle_v3p3_v3p3__5_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[3:5]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v5
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2480,16 +2453,15 @@ define void @v_shuffle_v3p3_v3p3__5_0_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:5]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: v_mov_b32_e32 v7, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[4:6]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: v_mov_b32_e32 v4, v0
-; GFX900-NEXT: v_mov_b32_e32 v5, v2
-; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v6
+; GFX900-NEXT: v_mov_b32_e32 v2, v3
+; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2591,15 +2563,15 @@ define void @v_shuffle_v3p3_v3p3__5_3_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[2:4]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:5]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v5
-; GFX900-NEXT: v_mov_b32_e32 v1, v3
-; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: v_mov_b32_e32 v2, v4
+; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2751,12 +2723,11 @@ define void @v_shuffle_v3p3_v3p3__1_3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v3p3__1_3_3:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2895,13 +2866,12 @@ define void @v_shuffle_v3p3_v3p3__5_3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: v_mov_b32_e32 v3, v0
-; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: v_mov_b32_e32 v2, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2988,16 +2958,14 @@ define void @v_shuffle_v3p3_v3p3__5_0_3(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: v_mov_b32_e32 v5, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: v_mov_b32_e32 v3, v0
-; GFX900-NEXT: v_mov_b32_e32 v4, v1
-; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[2:4]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -3103,10 +3071,8 @@ define void @v_shuffle_v3p3_v3p3__5_2_3(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[3:5]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v5
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v3
-; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
+; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -3306,9 +3272,8 @@ define void @v_shuffle_v3p3_v3p3__1_4_4(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT: ; def v[2:4]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: v_mov_b32_e32 v5, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: v_mov_b32_e32 v4, v3
-; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v2, v3
+; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -3456,10 +3421,10 @@ define void @v_shuffle_v3p3_v3p3__4_4_4(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v1
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -3545,13 +3510,12 @@ define void @v_shuffle_v3p3_v3p3__5_u_4(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_u_4:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -3592,15 +3556,15 @@ define void @v_shuffle_v3p3_v3p3__5_0_4(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:3]
+; GFX900-NEXT: ; def v[2:4]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, v0
-; GFX900-NEXT: v_mov_b32_e32 v5, v2
-; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: v_mov_b32_e32 v2, v3
+; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -3705,14 +3669,13 @@ define void @v_shuffle_v3p3_v3p3__5_2_4(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[3:5]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, v5
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: v_mov_b32_e32 v2, v4
-; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
+; GFX900-NEXT: v_mov_b32_e32 v3, v4
+; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -3760,13 +3723,12 @@ define void @v_shuffle_v3p3_v3p3__5_3_4(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_3_4:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v5, 0
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, v0
-; GFX900-NEXT: v_mov_b32_e32 v4, v1
-; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -3807,12 +3769,12 @@ define void @v_shuffle_v3p3_v3p3__u_5_5(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v3p3__u_5_5:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -3913,9 +3875,9 @@ define void @v_shuffle_v3p3_v3p3__1_5_5(ptr addrspace(1) inreg %ptr) {
 ; GFX900-NEXT: ; def v[2:4]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: v_mov_b32_e32 v5, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
+; GFX900-NEXT: v_mov_b32_e32 v2, v4
 ; GFX900-NEXT: v_mov_b32_e32 v3, v4
-; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -3963,15 +3925,15 @@ define void @v_shuffle_v3p3_v3p3__2_5_5(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:5]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: v_mov_b32_e32 v1, v3
 ; GFX900-NEXT: v_mov_b32_e32 v3, v2
-; GFX900-NEXT: v_mov_b32_e32 v4, v5
-; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -4060,13 +4022,12 @@ define void @v_shuffle_v3p3_v3p3__4_5_5(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v3p3__4_5_5:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v3, v2
+; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -4150,15 +4111,15 @@ define void @v_shuffle_v3p3_v3p3__5_0_5(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:3]
+; GFX900-NEXT: ; def v[2:4]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-NEXT: v_mov_b32_e32 v1, v3
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: v_mov_b32_e32 v2, v4
+; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -4262,15 +4223,15 @@ define void @v_shuffle_v3p3_v3p3__5_2_5(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[3:5]
+; GFX900-NEXT: ; def v[0:2]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[3:5]
 ; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
 ; GFX900-NEXT: v_mov_b32_e32 v3, v5
-; GFX900-NEXT: v_mov_b32_e32 v4, v2
-; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17]
+; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -4316,14 +4277,13 @@ define void @v_shuffle_v3p3_v3p3__5_3_5(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_3_5:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v6, 0
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:2]
+; GFX900-NEXT: ; def v[1:3]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, v2
-; GFX900-NEXT: v_mov_b32_e32 v4, v0
-; GFX900-NEXT: v_mov_b32_e32 v5, v2
-; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: v_mov_b32_e32 v2, v3
+; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v3bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v3bf16.ll
index fa422e48bbce0..89ce868b03546 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v3bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v3bf16.ll
@@ -103,12 +103,11 @@ define void @v_shuffle_v4bf16_v3bf16__2_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__2_u_u_u:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
+; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -203,12 +202,11 @@ define void @v_shuffle_v4bf16_v3bf16__5_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_u_u_u:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
+; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -567,16 +565,15 @@ define void @v_shuffle_v4bf16_v3bf16__5_5_0_u(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[1:2]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4
+; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -744,14 +741,13 @@ define void @v_shuffle_v4bf16_v3bf16__5_5_3_u(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_u:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
-; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17]
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4
+; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1606,16 +1602,15 @@ define void @v_shuffle_v4bf16_v3bf16__5_u_0_0(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ; def v[2:3]
 ; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s4, 0x5040100
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
+; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2639,16 +2634,15 @@ define void @v_shuffle_v4bf16_v3bf16__5_u_1_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ; def v[2:3]
 ; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s4, 0x7060302
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
+; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -3017,16 +3011,15 @@ define void @v_shuffle_v4bf16_v3bf16__5_5_u_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[1:2]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4
+; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -3077,16 +3070,15 @@ define void @v_shuffle_v4bf16_v3bf16__5_5_0_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[1:2]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4
+; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -3697,16 +3689,15 @@ define void @v_shuffle_v4bf16_v3bf16__5_u_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ; def v[1:2]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
+; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
+; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -4459,12 +4450,11 @@ define void @v_shuffle_v4bf16_v3bf16__2_3_3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__2_3_3_3:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
+; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -5982,14 +5972,13 @@ define void @v_shuffle_v4bf16_v3bf16__5_5_u_4(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_4:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
-; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17]
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4
+; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -6223,14 +6212,13 @@ define void @v_shuffle_v4bf16_v3bf16__5_5_3_4(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_4:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
-; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17]
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4
+; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v4bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v4bf16.ll
index ab297c02fe3b5..8e24d6e02f3ff 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v4bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v4bf16.ll
@@ -100,12 +100,11 @@ define void @v_shuffle_v4bf16_v4bf16__2_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__2_u_u_u:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
+; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -238,12 +237,11 @@ define void @v_shuffle_v4bf16_v4bf16__6_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__6_u_u_u:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
+; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -721,16 +719,15 @@ define void @v_shuffle_v4bf16_v4bf16__7_7_0_u(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[1:2]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: s_mov_b32 s4, 0x7060302
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4
+; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -950,14 +947,13 @@ define void @v_shuffle_v4bf16_v4bf16__7_7_4_u(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_u:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
-; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: s_mov_b32 s4, 0x7060302
 ; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17]
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4
+; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -3870,16 +3866,15 @@ define void @v_shuffle_v4bf16_v4bf16__7_7_u_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[1:2]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: s_mov_b32 s4, 0x7060302
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4
+; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -3928,16 +3923,15 @@ define void @v_shuffle_v4bf16_v4bf16__7_7_0_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[1:2]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: s_mov_b32 s4, 0x7060302
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4
+; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -7088,12 +7082,11 @@ define void @v_shuffle_v4bf16_v4bf16__2_4_4_4(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__2_4_4_4:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
+; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -9087,14 +9080,13 @@ define void @v_shuffle_v4bf16_v4bf16__7_7_u_5(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_u_5:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
-; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: s_mov_b32 s4, 0x7060302
 ; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17]
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4
+; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -9377,14 +9369,13 @@ define void @v_shuffle_v4bf16_v4bf16__7_7_4_5(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_5:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
-; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: s_mov_b32 s4, 0x7060302
 ; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17]
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4
+; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v3f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v3f16.ll
index e91433ac4c1f7..d1ff8c658c77d 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v3f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v3f16.ll
@@ -103,12 +103,11 @@ define void @v_shuffle_v4f16_v3f16__2_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v4f16_v3f16__2_u_u_u:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
+; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -203,12 +202,11 @@ define void @v_shuffle_v4f16_v3f16__5_u_u_u(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_u_u_u:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
+; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -567,16 +565,15 @@ define void @v_shuffle_v4f16_v3f16__5_5_0_u(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[1:2]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4
+; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -744,14 +741,13 @@ define void @v_shuffle_v4f16_v3f16__5_5_3_u(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_3_u:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
-; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17]
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4
+; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1606,16 +1602,15 @@ define void @v_shuffle_v4f16_v3f16__5_u_0_0(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ; def v[2:3]
 ; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s4, 0x5040100
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
+; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2639,16 +2634,15 @@ define void @v_shuffle_v4f16_v3f16__5_u_1_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ; def v[2:3]
 ; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s4, 0x7060302
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
+; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -3017,16 +3011,15 @@ define void @v_shuffle_v4f16_v3f16__5_5_u_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[1:2]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4
+; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -3077,16 +3070,15 @@ define void @v_shuffle_v4f16_v3f16__5_5_0_1(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
-; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[1:2]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17]
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4
+; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -3697,16 +3689,15 @@ define void @v_shuffle_v4f16_v3f16__5_u_2_2(ptr addrspace(1) inreg %ptr) {
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ; def v[1:2]
 ; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100
-; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
+; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4
 ; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
+; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -4459,12 +4450,11 @@ define void @v_shuffle_v4f16_v3f16__2_3_3_3(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v4f16_v3f16__2_3_3_3:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
 ; GFX900-NEXT: ;;#ASMSTART
 ; GFX900-NEXT: ; def v[0:1]
 ; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v2, 0
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
+; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -5982,14 +5972,13 @@ define void @v_shuffle_v4f16_v3f16__5_5_u_4(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_u_4:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
-; GFX900-NEXT: ;;#ASMEND
 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4
-; GFX900-NEXT: v_mov_b32_e32 v2, v0
-; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17]
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4
+; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
 ; GFX900-NEXT: s_waitcnt vmcnt(0)
 ; GFX900-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -6223,14 +6212,13 @@ define void @v_shuffle_v4f16_v3f16__5_5_3_4(ptr addrspace(1) inreg %ptr) {
 ; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_3_4:
 ; GFX900: ; %bb.0:
 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; def v[0:1]
-; GFX900-NEXT: ;;#ASMEND
 ;
GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v4f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v4f16.ll index 47100b9983559..8a9a0d1a7ef5d 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v4f16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v4f16.ll @@ -100,12 +100,11 @@ define void @v_shuffle_v4f16_v4f16__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v4f16__2_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -238,12 +237,11 @@ define void @v_shuffle_v4f16_v4f16__6_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v4f16__6_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -721,16 +719,15 @@ define void @v_shuffle_v4f16_v4f16__7_7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -950,14 +947,13 @@ define void @v_shuffle_v4f16_v4f16__7_7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_4_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ 
-3870,16 +3866,15 @@ define void @v_shuffle_v4f16_v4f16__7_7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3928,16 +3923,15 @@ define void @v_shuffle_v4f16_v4f16__7_7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7088,12 +7082,11 @@ define void @v_shuffle_v4f16_v4f16__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v4f16__2_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9087,14 +9080,13 @@ define void @v_shuffle_v4f16_v4f16__7_7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9377,14 +9369,13 @@ define void @v_shuffle_v4f16_v4f16__7_7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_4_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], 
s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll index 7c8417837f788..5828e40595f9f 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll @@ -3985,12 +3985,11 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4004,11 +4003,10 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] @@ -4105,12 +4103,11 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4124,12 +4121,11 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] @@ -6709,16 +6705,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: 
v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6726,17 +6722,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6871,28 +6867,29 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v3i16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v3i16.ll index 7b3a5a879f44f..1a7e281e7e138 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v3i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v3i16.ll @@ -103,12 +103,11 @@ define void @v_shuffle_v4i16_v3i16__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v3i16__2_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; 
GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -203,12 +202,11 @@ define void @v_shuffle_v4i16_v3i16__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -567,16 +565,15 @@ define void @v_shuffle_v4i16_v3i16__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -744,14 +741,13 @@ define void @v_shuffle_v4i16_v3i16__5_5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_3_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1606,16 +1602,15 @@ define void @v_shuffle_v4i16_v3i16__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2639,16 +2634,15 @@ define void @v_shuffle_v4i16_v3i16__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: 
; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3017,16 +3011,15 @@ define void @v_shuffle_v4i16_v3i16__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3077,16 +3070,15 @@ define void @v_shuffle_v4i16_v3i16__5_5_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3697,16 +3689,15 @@ define void @v_shuffle_v4i16_v3i16__5_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4459,12 +4450,11 @@ define void @v_shuffle_v4i16_v3i16__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v3i16__2_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: 
global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5982,14 +5972,13 @@ define void @v_shuffle_v4i16_v3i16__5_5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6223,14 +6212,13 @@ define void @v_shuffle_v4i16_v3i16__5_5_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_3_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v4i16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v4i16.ll index 2a371b7c7d2d3..05ebf49b997eb 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v4i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v4i16.ll @@ -100,12 +100,11 @@ define void @v_shuffle_v4i16_v4i16__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v4i16__2_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -238,12 +237,11 @@ define void @v_shuffle_v4i16_v4i16__6_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v4i16__6_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -721,16 +719,15 @@ define void @v_shuffle_v4i16_v4i16__7_7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 
0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -950,14 +947,13 @@ define void @v_shuffle_v4i16_v4i16__7_7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_4_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3870,16 +3866,15 @@ define void @v_shuffle_v4i16_v4i16__7_7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3928,16 +3923,15 @@ define void @v_shuffle_v4i16_v4i16__7_7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:2] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7088,12 +7082,11 @@ define void @v_shuffle_v4i16_v4i16__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v4i16__2_4_4_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v2, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9087,14 +9080,13 @@ define void 
@v_shuffle_v4i16_v4i16__7_7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9377,14 +9369,13 @@ define void @v_shuffle_v4i16_v4i16__7_7_4_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_4_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll index f7149350e74d3..3a659e1753e97 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll @@ -3985,12 +3985,11 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4004,11 +4003,10 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] @@ -4105,12 +4103,11 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4124,12 +4121,11 @@ 
define void @v_shuffle_v4i32_v3i32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] @@ -6709,16 +6705,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6726,17 +6722,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6871,28 +6867,29 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: 
; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll index aa9e23b971823..f1c1e4b20f242 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll @@ -3985,12 +3985,11 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4004,11 +4003,10 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] @@ -4105,12 +4103,11 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4124,12 +4121,11 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] @@ -6709,16 +6705,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; 
GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6726,17 +6722,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6871,28 +6867,29 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_4_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_4_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[0:1] op_sel:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[2:3] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll index b13829e0351f6..8ecc0ad65a944 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll +++ 
b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll @@ -12,10 +12,10 @@ define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 { ; GCN-NEXT: [[AV_MOV_:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 1, implicit $exec ; GCN-NEXT: [[AV_MOV_1:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 2, implicit $exec ; GCN-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[AV_MOV_]], [[AV_MOV_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def undef %14.sub0 + ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def undef %14.sub0 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] ; GCN-NEXT: GLOBAL_STORE_DWORDX4 undef %24:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) - ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3407881 /* reguse:VReg_64 */, %14 + ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2818057 /* reguse:VReg_64 */, %14 ; GCN-NEXT: S_ENDPGM 0 %v0 = call i32 asm sideeffect "; def $0", "=v"() %tmp = insertelement <2 x i32> poison, i32 %v0, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/subreg-undef-def-with-other-subreg-defs.mir b/llvm/test/CodeGen/AMDGPU/subreg-undef-def-with-other-subreg-defs.mir index da6b57c776796..2fd1e36c4181e 100644 --- a/llvm/test/CodeGen/AMDGPU/subreg-undef-def-with-other-subreg-defs.mir +++ b/llvm/test/CodeGen/AMDGPU/subreg-undef-def-with-other-subreg-defs.mir @@ -28,9 +28,9 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec :: (load (s32), addrspace 3) - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def [[V_MOV_B32_e32_]], 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_]](tied-def 3) - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_]] - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub0, 1835018 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub1 + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def [[V_MOV_B32_e32_]], 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_]](tied-def 3) + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_]] + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub0, 1245194 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub1 ; CHECK-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_]].sub1 ; CHECK-NEXT: $sgpr10 = S_MOV_B32 -1 ; CHECK-NEXT: S_BRANCH %bb.1 @@ -41,9 +41,9 @@ body: | bb.1: %2:vgpr_32 = DS_READ_B32_gfx9 %1, 0, 0, implicit $exec :: (load (s32), addrspace 3) - INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %0, 2147483657 /* reguse tiedto:$0 */, %0(tied-def 3) - INLINEASM &"", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %2 - INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def undef %0.sub0, 1835018 /* regdef:VGPR_32 */, def %0.sub1 + INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %0, 2147483657 /* reguse tiedto:$0 */, %0(tied-def 3) + INLINEASM 
&"", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %2 + INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def undef %0.sub0, 1245194 /* regdef:VGPR_32 */, def %0.sub1 S_NOP 0, implicit %0.sub1 $sgpr10 = S_MOV_B32 -1 S_BRANCH %bb.1 @@ -69,9 +69,9 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec :: (load (s32), addrspace 3) - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def [[V_MOV_B32_e32_]], 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_]](tied-def 3) - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_]] - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub1, 1835018 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub0 + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def [[V_MOV_B32_e32_]], 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_]](tied-def 3) + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_]] + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub1, 1245194 /* regdef:VGPR_32 */, def undef [[V_MOV_B32_e32_]].sub0 ; CHECK-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_]].sub1 ; CHECK-NEXT: $sgpr10 = S_MOV_B32 -1 ; CHECK-NEXT: S_BRANCH %bb.1 @@ -82,9 +82,9 @@ body: | bb.1: %2:vgpr_32 = DS_READ_B32_gfx9 %1, 0, 0, implicit $exec :: (load (s32), addrspace 3) - INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %0, 2147483657 /* reguse tiedto:$0 */, %0(tied-def 3) - INLINEASM &"", 1 /* sideeffect attdialect */, 1835017 /* reguse:VGPR_32 */, %2 - INLINEASM &"", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %0.sub1, 1835018 /* regdef:VGPR_32 */, def undef %0.sub0 + INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %0, 2147483657 /* reguse tiedto:$0 */, %0(tied-def 3) + INLINEASM &"", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, %2 + INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %0.sub1, 1245194 /* regdef:VGPR_32 */, def undef %0.sub0 S_NOP 0, implicit %0.sub1 $sgpr10 = S_MOV_B32 -1 S_BRANCH %bb.1 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll index 1b4ed67eb6eea..94448411cfd0e 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll @@ -2625,12 +2625,11 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v2, 0 ; GFX7-GISEL-NEXT: v_mov_b32_e32 v6, v1 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v8 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v3, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v3, v[8:9] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v4, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v2, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v6, v2, v[9:10] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v5, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v4, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v4, v[8:9] ; 
GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v3i64: @@ -2653,12 +2652,11 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v2, 0 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v3, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v3, v[8:9] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v4, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v2, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v6, v2, v[9:10] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v5, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v4, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v4, v[8:9] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v3i64: @@ -2703,13 +2701,12 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX10-GISEL-LABEL: test_vector_reduce_mul_v3i64: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v7, v1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], s4, v6, v2, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v6, v3, v[9:10] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v4, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v7, v2, v[9:10] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v8, v5, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v0, v2, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], s4, v0, v3, v[8:9] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v7, v4, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v6, v2, v[8:9] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v7, v5, v[1:2] ; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v2, v4, v[5:6] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2731,16 +2728,16 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX11-GISEL-LABEL: test_vector_reduce_mul_v3i64: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v6, v2, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v8, v4, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v6, v3, v[9:10] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v7, v2, v[10:11] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v8, v5, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v11, v4, v[6:7] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v0, v2, 0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v0, v3, v[8:9] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v7, v4, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v6, v2, v[9:10] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v7, v5, v[1:2] +; GFX11-GISEL-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v10, v4, v[8:9] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: test_vector_reduce_mul_v3i64: @@ -2810,18 +2807,16 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX7-GISEL-LABEL: test_vector_reduce_mul_v4i64: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v4, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v2, v6, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v2, v6, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v0, v4, 0 ; GFX7-GISEL-NEXT: v_mov_b32_e32 v8, v1 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v10 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v0, v5, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v2, v7, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v11, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v3, v6, v[15:16] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v4, v[13:14] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v17, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v11, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v2, v7, v[10:11] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v0, v5, v[12:13] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v3, v6, v[13:14] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v9, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v4, v[15:16] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v17, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v9, v[3:4] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -2847,18 +2842,16 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX8-GISEL-LABEL: test_vector_reduce_mul_v4i64: ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v4, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v2, v6, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v2, v6, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v0, v4, 0 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v8, v1 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v10 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v0, v5, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v2, v7, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v11, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v3, v6, v[15:16] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v4, v[13:14] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v17, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v11, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v2, v7, v[10:11] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v0, v5, v[12:13] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v3, v6, v[13:14] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v9, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v4, v[15:16] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v17, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v9, v[3:4] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -2915,19 +2908,16 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX10-GISEL-LABEL: test_vector_reduce_mul_v4i64: ; 
GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], s4, v2, v6, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v9, v1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], s4, v8, v4, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v11 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], s4, v2, v7, v[0:1] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v13 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v12, v10, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v8, v5, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v3, v6, v[14:15] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v9, v4, v[7:8] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v12, v2, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v3, v10, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v2, v6, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], s4, v0, v4, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v8, v1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v2, v7, v[10:11] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], s4, v0, v5, v[12:13] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v3, v6, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v11, v9, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v8, v4, v[12:13] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v11, v2, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v3, v9, v[1:2] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -2953,23 +2943,19 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX11-GISEL-LABEL: test_vector_reduce_mul_v4i64: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v2, v6, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v8, v4, 0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v11 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v2, v7, v[0:1] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, v13 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v12, v10, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[15:16], null, v8, v5, v[2:3] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v3, v6, v[14:15] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v9, v4, v[15:16] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v2, v6, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v0, v4, 0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v8, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v2, v7, v[10:11] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v0, v5, v[12:13] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v11, v9, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v12, v7, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v5, v10, v[3:4] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[15:16], null, v3, v6, v[13:14] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v8, v4, v[14:15] +; GFX11-GISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v11, v15, v[1:2] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v5, v9, v[3:4] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -3070,29 +3056,26 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v12, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v8, 0 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v4, v13, v[17:18] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v5, v12, v[18:19] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v16, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v20, v22, v[5:6] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v21 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v14, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v5, v12, v[18:19] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v8, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v16, 0 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[5:6] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v20 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v8, v[17:18] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v15, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v10, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v14, v[8:9] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v19, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v3, v10, v[5:6] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v8 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v0, v1, v[2:3] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v7, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v19, v[5:6] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v21, v16, v[12:13] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v2, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v7, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v4, v20, v[13:14] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v14, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v15, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v10, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v7, v14, v[19:20] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v2, v11, v[6:7] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v4, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v5, v23, v[7:8] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v3, v10, v[13:14] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, v8, v[17:18] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v23, v4, v[19:20] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v6, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v16, v[21:22] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v7, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[4:5] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3139,29 +3122,26 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v12, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v8, 0 ; GFX8-GISEL-NEXT: v_mad_u64_u32 
v[18:19], s[4:5], v4, v13, v[17:18] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v5, v12, v[18:19] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v16, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v20, v22, v[5:6] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v21 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v14, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v5, v12, v[18:19] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v8, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v16, 0 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[5:6] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v20 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v8, v[17:18] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v15, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v10, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v14, v[8:9] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v19, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v3, v10, v[5:6] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v8 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v0, v1, v[2:3] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v7, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v19, v[5:6] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v21, v16, v[12:13] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v2, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v7, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v4, v20, v[13:14] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v14, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v15, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v10, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v7, v14, v[19:20] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v2, v11, v[6:7] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v4, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v5, v23, v[7:8] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v3, v10, v[13:14] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, v8, v[17:18] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v23, v4, v[19:20] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v6, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v16, v[21:22] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v7, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[4:5] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3266,34 +3246,27 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX10-GISEL-LABEL: test_vector_reduce_mul_v8i64: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[17:18], s4, v0, v8, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[19:20], s4, v2, v10, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v16, v1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[21:22], s4, v6, v14, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[23:24], s4, v4, v12, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v18 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v18, v20 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[25:26], s4, v0, v9, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v2, v11, v[18:19] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v22 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v24 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[29:30], s4, v4, v13, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[26:27], s4, v6, v15, v[0:1] 
-; GFX10-GISEL-NEXT: v_mad_u64_u32 v[27:28], s4, v19, v21, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[30:31], s4, v17, v23, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v3, v10, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v7, v14, v[26:27] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v28 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v5, v12, v[29:30] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, v31 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v19, v6, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v16, v8, v[25:26] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v30, v27, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v17, v3, v[4:5] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v2, v21, v[5:6] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v6, v23, v[3:4] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v30, v4, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v2, v27, v[3:4] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[16:17], s4, v6, v14, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[18:19], s4, v0, v8, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[20:21], s4, v2, v10, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[22:23], s4, v4, v12, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[24:25], s4, v6, v15, v[17:18] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[25:26], s4, v0, v9, v[19:20] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[26:27], s4, v2, v11, v[21:22] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[27:28], s4, v4, v13, v[23:24] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], s4, v7, v14, v[24:25] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v20, v16, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v3, v10, v[26:27] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v5, v12, v[27:28] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v18, v22, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], s4, v20, v13, v[7:8] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v1, v8, v[25:26] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v18, v4, v[3:4] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v9, v16, v[10:11] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v2, v6, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v7, v22, v[3:4] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v2, v4, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v6, v[1:2] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3337,39 +3310,34 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX11-GISEL-LABEL: test_vector_reduce_mul_v8i64: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[17:18], null, v0, v8, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[19:20], null, v2, v10, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[21:22], null, v6, v14, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[23:24], null, v4, v12, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v16, v1 :: v_dual_mov_b32 v1, v18 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v18, v20 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[25:26], null, v0, v9, v[1:2] -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, v22 :: v_dual_mov_b32 v1, v24 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[26:27], null, v2, v11, v[18:19] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[27:28], null, v6, v15, v[0:1] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[30:31], null, v4, v13, v[1:2] -; GFX11-GISEL-NEXT: 
v_mad_u64_u32 v[28:29], null, v19, v21, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[31:32], null, v17, v23, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v7, v14, v[27:28] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v3, v10, v[26:27] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v5, v12, v[30:31] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, v29 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, v32 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v16, v8, v[25:26] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v19, v0, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v31, v28, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v17, v2, v[3:4] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v6, v21, v[4:5] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v9, v23, v[7:8] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v31, v2, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v3, v28, v[4:5] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[16:17], null, v6, v14, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[18:19], null, v0, v8, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[20:21], null, v2, v10, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[22:23], null, v4, v12, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[24:25], null, v6, v15, v[17:18] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[25:26], null, v0, v9, v[19:20] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[26:27], null, v2, v11, v[21:22] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[27:28], null, v4, v13, v[23:24] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[28:29], null, v7, v14, v[24:25] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v20, v16, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v3, v10, v[26:27] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v5, v12, v[27:28] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v18, v22, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v20, v28, v[7:8] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v1, v8, v[25:26] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v2, v6, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v18, v9, v[3:4] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v13, v16, v[4:5] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v10, v22, v[7:8] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v2, v8, v[1:2] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v3, v6, v[4:5] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3551,60 +3519,49 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v8, v25, v[32:33] ; 
GFX7-GISEL-NEXT: v_mad_u64_u32 v[35:36], s[4:5], v9, v24, v[33:34] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v16, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v0, v17, v[9:10] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v8, v31, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v35, v[25:26] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v8, v9 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[4:5], v0, v17, v[8:9] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v16, v[34:35] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[36:37], s[4:5], v8, v35, v[25:26] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v16, v[32:33] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v26, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v31, v[32:33] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v31, v[36:37] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v27, v[1:2] -; GFX7-GISEL-NEXT: buffer_load_dword v27, off, s[0:3], s32 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v2, v18, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v11, v26, v[8:9] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v31, v0, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v9 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v31, v33, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v32 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v2, v19, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[25:26] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v11, v26, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v18, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v19, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v0, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v8, v31, v[2:3] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v18, v[10:11] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v28, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v1, v0, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v20, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v12, v29, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v2, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[18:19] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v10 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v21, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v8, v0, v[25:26] +; GFX7-GISEL-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v29, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v20, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v13, v28, v[10:11] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v2, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v4, v21, v[9:10] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v14, v30, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v0, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v20, v[12:13] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v22, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v2, v[18:19] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v3, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[27:28], s[4:5], v8, v18, v[11:12] ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v27, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v5 -; GFX7-GISEL-NEXT: 
v_mad_u64_u32 v[0:1], s[4:5], v15, v30, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v23, v[2:3] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v22, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v24, v11, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v0, v3, v[5:6] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v12, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v10 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v24, v20, v[0:1] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v8, v1, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v3, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v17, v12, v[13:14] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v11, v[5:6] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v3, v[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v0, v[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v15, v30, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v22, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v23, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v3, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v8, v13, v[12:13] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v20, v[25:26] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v22, v[14:15] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v2, v[27:28] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[18:19] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v11, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v24, v10, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v7, v[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v24, v12, v[6:7] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v11, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v3, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[13:14] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v6, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[8:9] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v16i64: @@ -3696,60 +3653,49 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v8, v25, v[32:33] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[35:36], s[4:5], v9, v24, v[33:34] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v16, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v0, v17, v[9:10] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v8, v31, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v35, v[25:26] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v8, v9 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[4:5], v0, v17, v[8:9] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v16, v[34:35] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[36:37], s[4:5], v8, v35, v[25:26] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v16, v[32:33] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v26, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v31, v[32:33] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v31, v[36:37] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v27, v[1:2] -; GFX8-GISEL-NEXT: buffer_load_dword v27, off, s[0:3], s32 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v2, v18, 0 
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v11, v26, v[8:9] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v31, v0, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v9 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v31, v33, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v32 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v2, v19, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[25:26] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v11, v26, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v18, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v19, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v0, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v8, v31, v[2:3] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v18, v[10:11] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v28, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v1, v0, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v20, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v12, v29, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v2, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[18:19] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v21, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v8, v0, v[25:26] +; GFX8-GISEL-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v29, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v20, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v13, v28, v[10:11] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v2, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v4, v21, v[9:10] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v14, v30, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v0, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v20, v[12:13] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v22, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v2, v[18:19] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v3, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[27:28], s[4:5], v8, v18, v[11:12] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v27, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v5 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v15, v30, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v23, v[2:3] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v22, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v24, v11, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v0, v3, v[5:6] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v12, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v10 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v24, v20, v[0:1] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v8, v1, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v3, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v17, v12, v[13:14] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v11, v[5:6] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v3, v[4:5] +; 
GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v0, v[4:5] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v15, v30, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v22, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v23, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v3, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v8, v13, v[12:13] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v20, v[25:26] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v22, v[14:15] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v2, v[27:28] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[18:19] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v11, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v24, v10, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v7, v[4:5] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v24, v12, v[6:7] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v11, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v3, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[13:14] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v6, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[8:9] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v16i64: @@ -3956,66 +3902,53 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX10-GISEL-LABEL: test_vector_reduce_mul_v16i64: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[31:32], s4, v0, v16, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[33:34], s4, v2, v18, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[38:39], s4, v6, v22, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[35:36], s4, v0, v17, v[32:33] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[36:37], s4, v4, v20, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v34 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[16:17], s4, v1, v16, v[35:36] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v2, v19, v[0:1] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v37 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v39 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[34:35], s4, v8, v24, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[48:49], s4, v4, v21, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[49:50], s4, v10, v26, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v6, v23, v[2:3] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v35 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v5, v20, v[48:49] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[51:52], s4, v8, v25, v[2:3] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v50 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[52:53], s4, v10, v27, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[53:54], s4, v12, v28, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v9, v24, v[51:52] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v54 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], s4, v11, v26, v[52:53] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[19:20], s4, v36, v53, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[54:55], s4, v12, v29, v[2:3] -; GFX10-GISEL-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v3, v18, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v14, v30, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[17:18], s4, v33, v49, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v7, v22, v[1:2] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v31, v34, 0 +; GFX10-GISEL-NEXT: buffer_load_dword v35, off, s[0:3], s32 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[31:32], s4, v14, 
v30, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[33:34], s4, v6, v22, 0 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v14, v12, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], s4, v38, v3, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v8 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], s4, v13, v28, v[54:55] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], s4, v15, v30, v[0:1] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v18 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], s4, v31, v9, v[1:2] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v33, v10, v[0:1] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v20 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], s4, v38, v14, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[20:21], s4, v17, v11, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v36, v13, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], s4, v7, v19, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v6, v3, v[14:15] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v21 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v2, v49, v[9:10] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v13 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v5, v53, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v17, v3, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v12, v20, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v7, v4, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], s4, v16, v34, v[8:9] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v9, v11, v[5:6] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v13, v19, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v12, v3, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v4, v20, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[36:37], s4, v14, v35, v[32:33] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[37:38], s4, v6, v23, v[34:35] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[34:35], s4, v33, v31, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[38:39], s4, v15, v30, v[36:37] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], s4, v0, v16, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[36:37], s4, v7, v22, v[37:38] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v2, v18, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[22:23], s4, v4, v20, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[32:33], s4, v33, v38, v[35:36] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[37:38], s4, v0, v17, v[15:16] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[38:39], s4, v2, v19, v[7:8] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[35:36], s4, v36, v31, v[32:33] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[30:31], s4, v8, v24, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[32:33], s4, v10, v26, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[15:16], s4, v1, v16, v[37:38] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[16:17], s4, v3, v18, v[38:39] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v12, v28, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v4, v21, v[23:24] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v8, v25, v[31:32] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v10, v27, v[33:34] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[17:18], s4, v12, v29, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[18:19], s4, v5, v20, v[2:3] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[19:20], s4, v9, v24, v[3:4] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v11, v26, v[7:8] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v6, v32, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], s4, v13, v28, v[17:18] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v22, v0, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v14, v30, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], s4, v6, v9, v[4:5] +; 
GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], s4, v22, v10, v[8:9] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], s4, v3, v34, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], s4, v14, v19, v[2:3] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v1, v7, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], s4, v16, v32, v[11:12] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], s4, v18, v0, v[12:13] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], s4, v15, v30, v[13:14] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v3, v35, v[9:10] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v1, v11, v[5:6] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v8, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v10, v34, v[2:3] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v12, v7, v[5:6] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v4, v2, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v5, v8, v[1:2] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: test_vector_reduce_mul_v16i64: @@ -4096,66 +4029,62 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX11-GISEL-LABEL: test_vector_reduce_mul_v16i64: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: scratch_load_b32 v71, off, s32 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[31:32], null, v0, v16, 0 +; GFX11-GISEL-NEXT: scratch_load_b32 v55, off, s32 ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[33:34], null, v2, v18, 0 ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[35:36], null, v4, v20, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[31:32], null, v0, v16, 0 ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[37:38], null, v6, v22, 0 ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[50:51], null, v10, v26, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[82:83], null, v2, v19, v[34:35] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[70:71], null, v0, v17, v[32:33] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[83:84], null, v4, v21, v[36:37] ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[52:53], null, v12, v28, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[64:65], null, v14, v30, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[66:67], null, v33, v50, 0 ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[48:49], null, v8, v24, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[54:55], null, v14, v30, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[82:83], null, v0, v17, v[32:33] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[83:84], null, v2, v19, v[34:35] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[84:85], null, v4, v21, v[36:37] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[85:86], null, v6, v23, v[38:39] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[86:87], null, v10, v27, v[51:52] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[65:66], null, v31, v48, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[84:85], null, v6, v23, v[38:39] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[96:97], null, v1, v16, v[70:71] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v3, v18, v[82:83] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[85:86], null, v10, v27, v[51:52] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[86:87], null, v12, v29, v[53:54] ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[38:39], null, v8, v25, v[49:50] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v64, v55 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[96:97], null, v12, v29, v[53:54] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[97:98], null, v1, v16, v[82:83] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v3, v18, v[83:84] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v5, v20, v[84:85] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v7, v22, v[85:86] -; 
GFX11-GISEL-NEXT: v_mad_u64_u32 v[67:68], null, v33, v50, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[80:81], null, v37, v54, 0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v66 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[69:70], null, v35, v52, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[80:81], null, v37, v64, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[16:17], null, v5, v20, v[83:84] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[68:69], null, v35, v52, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[17:18], null, v7, v22, v[84:85] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v9, v24, v[38:39] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v13, v28, v[86:87] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[53:54], null, v31, v48, 0 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v14, v71, v[64:65] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v9, v24, v[38:39] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v11, v26, v[86:87] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v13, v28, v[96:97] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v15, v30, v[4:5] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, v68 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v31, v5, v[0:1] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v81 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v33, v6, v[4:5] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, v70 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v14, v55, v[65:66] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v15, v30, v[1:2] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v11, v26, v[85:86] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v53, v68, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v37, v4, v[81:82] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v33, v1, v[67:68] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v35, v7, v[69:70] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v66, v80, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v17, v64, v[8:9] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v31, v6, v[54:55] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v0, v50, v[9:10] ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v37, v8, v[0:1] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v67, v80, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v35, v7, v[4:5] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v65, v69, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v66, v11, v[5:6] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v16, v52, v[10:11] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v96, v48, v[7:8] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v53, v0, v[3:4] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v12, v80, v[8:9] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v2, v4, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v3, v54, v[5:6] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v1, v50, v[10:11] -; GFX11-GISEL-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v2, v52, v[13:14] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, v7 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v67, v14, v[0:1] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v97, v48, v[9:10] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v6, v11, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v65, v4, v[2:3] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v3, v80, v[7:8] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v12, v69, v[8:9] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v6, v4, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v9, v11, v[7:8] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v5, v68, v[6:7] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v2, v9, v[1:2] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v7, v4, v[5:6] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: test_vector_reduce_mul_v16i64: diff --git a/llvm/test/CodeGen/PowerPC/llvm.sincospi.ppcfp128.ll b/llvm/test/CodeGen/PowerPC/llvm.sincospi.ppcfp128.ll index bc656bb785e9e..4fbb6a07aa37d 100644 --- a/llvm/test/CodeGen/PowerPC/llvm.sincospi.ppcfp128.ll +++ b/llvm/test/CodeGen/PowerPC/llvm.sincospi.ppcfp128.ll @@ -1,6 +1,6 @@ ; XFAIL: * ; FIXME: asserts -; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-gnu-linux -filetype=null \ +; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-gnu-linux -filetype=null -enable-legalize-types-checking=0 \ ; RUN: -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names %s define { ppc_fp128, ppc_fp128 } @test_sincospi_ppcf128(ppc_fp128 %a) { diff --git a/llvm/test/CodeGen/PowerPC/milicode32.ll b/llvm/test/CodeGen/PowerPC/milicode32.ll index ddadd01a748f1..b69b997254d2c 100644 --- a/llvm/test/CodeGen/PowerPC/milicode32.ll +++ b/llvm/test/CodeGen/PowerPC/milicode32.ll @@ -68,7 +68,41 @@ entry: ret i32 %call } +define i32 @strlen_test_fp_strict(ptr noundef %str) nounwind { +; CHECK-AIX-32-P9-LABEL: strlen_test_fp_strict: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: mflr r0 +; CHECK-AIX-32-P9-NEXT: stwu r1, -64(r1) +; CHECK-AIX-32-P9-NEXT: stw r0, 72(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, 60(r1) +; CHECK-AIX-32-P9-NEXT: bl .___strlen[PR] +; CHECK-AIX-32-P9-NEXT: nop +; CHECK-AIX-32-P9-NEXT: addi r1, r1, 64 +; CHECK-AIX-32-P9-NEXT: lwz r0, 8(r1) +; CHECK-AIX-32-P9-NEXT: mtlr r0 +; CHECK-AIX-32-P9-NEXT: blr +; +; CHECK-LINUX32-P9-LABEL: strlen_test_fp_strict: +; CHECK-LINUX32-P9: # %bb.0: # %entry +; CHECK-LINUX32-P9-NEXT: mflr r0 +; CHECK-LINUX32-P9-NEXT: stwu r1, -16(r1) +; CHECK-LINUX32-P9-NEXT: stw r0, 20(r1) +; CHECK-LINUX32-P9-NEXT: stw r3, 12(r1) +; CHECK-LINUX32-P9-NEXT: bl strlen +; CHECK-LINUX32-P9-NEXT: lwz r0, 20(r1) +; CHECK-LINUX32-P9-NEXT: addi r1, r1, 16 +; CHECK-LINUX32-P9-NEXT: mtlr r0 +; CHECK-LINUX32-P9-NEXT: blr +entry: + %str.addr = alloca ptr, align 4 + store ptr %str, ptr %str.addr, align 4 + %0 = load ptr, ptr %str.addr, align 4 + %call = call i32 @strlen(ptr noundef %0) #0 + ret i32 %call +} + declare i32 @strlen(ptr noundef) nounwind +attributes #0 = { strictfp } define ptr @test_memmove(ptr noundef %destination, ptr noundef %source, i32 noundef %num) #0 { ; CHECK-AIX-32-P9-LABEL: test_memmove: diff --git a/llvm/test/CodeGen/WebAssembly/fast-isel-pr138479.ll 
b/llvm/test/CodeGen/WebAssembly/fast-isel-pr138479.ll index 2676000b968c3..1eb50d5f9564a 100644 --- a/llvm/test/CodeGen/WebAssembly/fast-isel-pr138479.ll +++ b/llvm/test/CodeGen/WebAssembly/fast-isel-pr138479.ll @@ -1,4 +1,5 @@ -; RUN: llc < %s -asm-verbose=false -fast-isel -fast-isel-abort=1 -verify-machineinstrs | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -asm-verbose=false -fast-isel -fast-isel-abort=0 -verify-machineinstrs | FileCheck %s target triple = "wasm32-unknown-unknown" @@ -13,3 +14,5 @@ define void @call_trunc_i64_to_i48(i64 %x) { call void @extern48(i48 %x48) ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/WebAssembly/fast-isel-simd128.ll b/llvm/test/CodeGen/WebAssembly/fast-isel-simd128.ll new file mode 100644 index 0000000000000..df14e1054d91b --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/fast-isel-simd128.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -fast-isel -fast-isel-abort=0 -mattr=+simd128 -verify-machineinstrs | FileCheck %s + +target triple = "wasm32-unknown-unknown" + +define i8 @pr165438(<4 x i32> %0) { +; CHECK-LABEL: pr165438: +; CHECK: .functype pr165438 (v128) -> (i32) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: i8x16.extract_lane_u 0 +; CHECK-NEXT: # fallthrough-return +entry: + %conv = trunc <4 x i32> %0 to <4 x i8> + br label %cond.true + + +cond.true: ; preds = %entry + %vecext = extractelement <4 x i8> %conv, i32 0 + ret i8 %vecext +} diff --git a/llvm/test/CodeGen/X86/build-vector-128.ll b/llvm/test/CodeGen/X86/build-vector-128.ll index e2db8d4241420..b8bb417e1860c 100644 --- a/llvm/test/CodeGen/X86/build-vector-128.ll +++ b/llvm/test/CodeGen/X86/build-vector-128.ll @@ -410,6 +410,472 @@ define <16 x i8> @test_buildvector_v16i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, ret <16 x i8> %ins15 } +; build vectors where integers operands are split (typically via legalization) + +define <4 x i32> @test_buildvector_v2i64_split_v4i32(i64 %a0, i64 %a1) nounwind { +; SSE-32-LABEL: test_buildvector_v2i64_split_v4i32: +; SSE-32: # %bb.0: +; SSE-32-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; SSE-32-NEXT: retl +; +; SSE2-64-LABEL: test_buildvector_v2i64_split_v4i32: +; SSE2-64: # %bb.0: +; SSE2-64-NEXT: movl %edi, %eax +; SSE2-64-NEXT: movl %esi, %ecx +; SSE2-64-NEXT: shrq $32, %rdi +; SSE2-64-NEXT: shrq $32, %rsi +; SSE2-64-NEXT: movd %ecx, %xmm1 +; SSE2-64-NEXT: movd %esi, %xmm0 +; SSE2-64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-64-NEXT: movd %eax, %xmm0 +; SSE2-64-NEXT: movd %edi, %xmm2 +; SSE2-64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-64-NEXT: retq +; +; SSE41-64-LABEL: test_buildvector_v2i64_split_v4i32: +; SSE41-64: # %bb.0: +; SSE41-64-NEXT: movl %edi, %eax +; SSE41-64-NEXT: movl %esi, %ecx +; SSE41-64-NEXT: shrq $32, %rdi +; SSE41-64-NEXT: shrq $32, %rsi +; SSE41-64-NEXT: movd %eax, %xmm0 +; SSE41-64-NEXT: pinsrd $1, %edi, %xmm0 +; SSE41-64-NEXT: pinsrd $2, %ecx, %xmm0 +; SSE41-64-NEXT: pinsrd $3, %esi, %xmm0 +; SSE41-64-NEXT: retq +; +; AVX-32-LABEL: test_buildvector_v2i64_split_v4i32: +; AVX-32: # %bb.0: +; 
AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: test_buildvector_v2i64_split_v4i32: +; AVX-64: # %bb.0: +; AVX-64-NEXT: movl %edi, %eax +; AVX-64-NEXT: movl %esi, %ecx +; AVX-64-NEXT: shrq $32, %rdi +; AVX-64-NEXT: shrq $32, %rsi +; AVX-64-NEXT: vmovd %eax, %xmm0 +; AVX-64-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrd $3, %esi, %xmm0, %xmm0 +; AVX-64-NEXT: retq + %a0.lo = trunc i64 %a0 to i32 + %a1.lo = trunc i64 %a1 to i32 + %a0.shr = lshr i64 %a0, 32 + %a1.shr = lshr i64 %a1, 32 + %a0.hi = trunc i64 %a0.shr to i32 + %a1.hi = trunc i64 %a1.shr to i32 + %v0 = insertelement <4 x i32> poison, i32 %a0.lo, i64 0 + %v1 = insertelement <4 x i32> %v0, i32 %a0.hi, i64 1 + %v2 = insertelement <4 x i32> %v1, i32 %a1.lo, i64 2 + %v3 = insertelement <4 x i32> %v2, i32 %a1.hi, i64 3 + ret <4 x i32> %v3 +} + +define <8 x i16> @test_buildvector_v4i32_split_v8i16(i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind { +; SSE2-32-LABEL: test_buildvector_v4i32_split_v8i16: +; SSE2-32: # %bb.0: +; SSE2-32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE2-32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-32-NEXT: retl +; +; SSE2-64-LABEL: test_buildvector_v4i32_split_v8i16: +; SSE2-64: # %bb.0: +; SSE2-64-NEXT: movd %ecx, %xmm0 +; SSE2-64-NEXT: movd %edx, %xmm1 +; SSE2-64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-64-NEXT: movd %esi, %xmm2 +; SSE2-64-NEXT: movd %edi, %xmm0 +; SSE2-64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-64-NEXT: retq +; +; SSE41-32-LABEL: test_buildvector_v4i32_split_v8i16: +; SSE41-32: # %bb.0: +; SSE41-32-NEXT: pushl %esi +; SSE41-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; SSE41-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; SSE41-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; SSE41-32-NEXT: movd %esi, %xmm0 +; SSE41-32-NEXT: shrl $16, %esi +; SSE41-32-NEXT: pinsrw $1, %esi, %xmm0 +; SSE41-32-NEXT: pinsrw $2, %edx, %xmm0 +; SSE41-32-NEXT: shrl $16, %edx +; SSE41-32-NEXT: pinsrw $3, %edx, %xmm0 +; SSE41-32-NEXT: pinsrw $4, %ecx, %xmm0 +; SSE41-32-NEXT: shrl $16, %ecx +; SSE41-32-NEXT: pinsrw $5, %ecx, %xmm0 +; SSE41-32-NEXT: pinsrw $6, %eax, %xmm0 +; SSE41-32-NEXT: shrl $16, %eax +; SSE41-32-NEXT: pinsrw $7, %eax, %xmm0 +; SSE41-32-NEXT: popl %esi +; SSE41-32-NEXT: retl +; +; SSE41-64-LABEL: test_buildvector_v4i32_split_v8i16: +; SSE41-64: # %bb.0: +; SSE41-64-NEXT: movd %edi, %xmm0 +; SSE41-64-NEXT: shrl $16, %edi +; SSE41-64-NEXT: pinsrw $1, %edi, %xmm0 +; SSE41-64-NEXT: pinsrw $2, %esi, %xmm0 +; SSE41-64-NEXT: shrl $16, %esi +; SSE41-64-NEXT: pinsrw $3, %esi, %xmm0 +; SSE41-64-NEXT: pinsrw $4, %edx, %xmm0 +; SSE41-64-NEXT: shrl $16, %edx +; SSE41-64-NEXT: pinsrw $5, %edx, %xmm0 +; SSE41-64-NEXT: pinsrw $6, %ecx, %xmm0 +; SSE41-64-NEXT: shrl $16, %ecx +; SSE41-64-NEXT: pinsrw $7, %ecx, %xmm0 +; SSE41-64-NEXT: retq +; +; AVX-32-LABEL: test_buildvector_v4i32_split_v8i16: +; AVX-32: # %bb.0: +; AVX-32-NEXT: pushl %esi +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %esi +; AVX-32-NEXT: vmovd %esi, %xmm0 +; AVX-32-NEXT: shrl $16, %esi +; AVX-32-NEXT: vpinsrw $1, %esi, %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrw $2, %edx, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $16, %edx +; AVX-32-NEXT: 
vpinsrw $3, %edx, %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $16, %ecx +; AVX-32-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $16, %eax +; AVX-32-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: popl %esi +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: test_buildvector_v4i32_split_v8i16: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vmovd %edi, %xmm0 +; AVX-64-NEXT: shrl $16, %edi +; AVX-64-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrw $2, %esi, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $16, %esi +; AVX-64-NEXT: vpinsrw $3, %esi, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrw $4, %edx, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $16, %edx +; AVX-64-NEXT: vpinsrw $5, %edx, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $16, %ecx +; AVX-64-NEXT: vpinsrw $7, %ecx, %xmm0, %xmm0 +; AVX-64-NEXT: retq + %a0.lo = trunc i32 %a0 to i16 + %a1.lo = trunc i32 %a1 to i16 + %a2.lo = trunc i32 %a2 to i16 + %a3.lo = trunc i32 %a3 to i16 + %a0.shr = lshr i32 %a0, 16 + %a1.shr = lshr i32 %a1, 16 + %a2.shr = lshr i32 %a2, 16 + %a3.shr = lshr i32 %a3, 16 + %a0.hi = trunc i32 %a0.shr to i16 + %a1.hi = trunc i32 %a1.shr to i16 + %a2.hi = trunc i32 %a2.shr to i16 + %a3.hi = trunc i32 %a3.shr to i16 + %v0 = insertelement <8 x i16> poison, i16 %a0.lo, i64 0 + %v1 = insertelement <8 x i16> %v0, i16 %a0.hi, i64 1 + %v2 = insertelement <8 x i16> %v1, i16 %a1.lo, i64 2 + %v3 = insertelement <8 x i16> %v2, i16 %a1.hi, i64 3 + %v4 = insertelement <8 x i16> %v3, i16 %a2.lo, i64 4 + %v5 = insertelement <8 x i16> %v4, i16 %a2.hi, i64 5 + %v6 = insertelement <8 x i16> %v5, i16 %a3.lo, i64 6 + %v7 = insertelement <8 x i16> %v6, i16 %a3.hi, i64 7 + ret <8 x i16> %v7 +} + +define <16 x i8> @test_buildvector_v8i16_split_v16i8(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind { +; SSE2-32-LABEL: test_buildvector_v8i16_split_v16i8: +; SSE2-32: # %bb.0: +; SSE2-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE2-32-NEXT: movd %eax, %xmm1 +; SSE2-32-NEXT: movdqa %xmm1, %xmm0 +; SSE2-32-NEXT: psrld $8, %xmm0 +; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE2-32-NEXT: movd %eax, %xmm0 +; SSE2-32-NEXT: movdqa %xmm0, %xmm2 +; SSE2-32-NEXT: psrld $8, %xmm2 +; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE2-32-NEXT: movd %eax, %xmm2 +; SSE2-32-NEXT: movdqa %xmm2, %xmm1 +; SSE2-32-NEXT: psrld $8, %xmm1 +; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE2-32-NEXT: movd %eax, %xmm1 +; SSE2-32-NEXT: movdqa %xmm1, %xmm3 +; SSE2-32-NEXT: psrld $8, %xmm3 +; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-32-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE2-32-NEXT: movd %eax, %xmm0 +; SSE2-32-NEXT: movdqa %xmm0, %xmm2 +; SSE2-32-NEXT: psrld $8, %xmm2 +; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE2-32-NEXT: movd %eax, %xmm2 +; SSE2-32-NEXT: movdqa %xmm2, %xmm3 +; SSE2-32-NEXT: psrld $8, %xmm3 +; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE2-32-NEXT: movd %eax, %xmm3 +; SSE2-32-NEXT: movdqa %xmm3, %xmm0 +; SSE2-32-NEXT: psrld $8, %xmm0 +; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE2-32-NEXT: movd %eax, %xmm0 +; SSE2-32-NEXT: movdqa %xmm0, %xmm4 +; SSE2-32-NEXT: psrld $8, %xmm4 +; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-32-NEXT: retl +; +; SSE2-64-LABEL: test_buildvector_v8i16_split_v16i8: +; SSE2-64: # %bb.0: +; SSE2-64-NEXT: pushq %rbp +; SSE2-64-NEXT: pushq %r15 +; SSE2-64-NEXT: pushq %r14 +; SSE2-64-NEXT: pushq %rbx +; SSE2-64-NEXT: movzwl %di, %eax +; SSE2-64-NEXT: movzwl %si, %r10d +; SSE2-64-NEXT: movzwl %dx, %r11d +; SSE2-64-NEXT: movzwl %cx, %ebx +; SSE2-64-NEXT: movzwl %r8w, %ebp +; SSE2-64-NEXT: movzwl %r9w, %r14d +; SSE2-64-NEXT: movzwl {{[0-9]+}}(%rsp), %r15d +; SSE2-64-NEXT: movd %r15d, %xmm0 +; SSE2-64-NEXT: movdqa %xmm0, %xmm1 +; SSE2-64-NEXT: psrld $8, %xmm1 +; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-64-NEXT: movzwl {{[0-9]+}}(%rsp), %r15d +; SSE2-64-NEXT: movd %r15d, %xmm2 +; SSE2-64-NEXT: movdqa %xmm2, %xmm1 +; SSE2-64-NEXT: psrld $8, %xmm1 +; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-64-NEXT: movd %r9d, %xmm0 +; SSE2-64-NEXT: movd %r14d, %xmm1 +; SSE2-64-NEXT: psrld $8, %xmm1 +; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-64-NEXT: movd %r8d, %xmm1 +; SSE2-64-NEXT: movd %ebp, %xmm3 +; SSE2-64-NEXT: psrld $8, %xmm3 +; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-64-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-64-NEXT: movd %ecx, %xmm0 +; SSE2-64-NEXT: movd %ebx, %xmm2 +; SSE2-64-NEXT: psrld $8, %xmm2 +; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-64-NEXT: movd %edx, %xmm2 +; SSE2-64-NEXT: movd %r11d, %xmm3 +; SSE2-64-NEXT: psrld $8, %xmm3 +; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-64-NEXT: movd %esi, %xmm3 +; SSE2-64-NEXT: movd %r10d, %xmm0 +; SSE2-64-NEXT: psrld $8, %xmm0 +; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-64-NEXT: movd %edi, %xmm0 +; SSE2-64-NEXT: movd %eax, %xmm4 +; SSE2-64-NEXT: psrld $8, %xmm4 +; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-64-NEXT: popq %rbx +; SSE2-64-NEXT: popq %r14 +; SSE2-64-NEXT: popq %r15 +; SSE2-64-NEXT: popq %rbp +; SSE2-64-NEXT: retq +; +; SSE41-32-LABEL: test_buildvector_v8i16_split_v16i8: +; SSE41-32: # %bb.0: +; SSE41-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: movd %eax, %xmm0 +; SSE41-32-NEXT: shrl $8, %eax +; SSE41-32-NEXT: pinsrb $1, %eax, %xmm0 +; SSE41-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: pinsrb $2, %eax, %xmm0 +; SSE41-32-NEXT: shrl $8, %eax +; SSE41-32-NEXT: pinsrb $3, %eax, %xmm0 +; SSE41-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: pinsrb $4, %eax, %xmm0 +; SSE41-32-NEXT: shrl $8, %eax +; SSE41-32-NEXT: pinsrb $5, %eax, %xmm0 +; SSE41-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: pinsrb $6, %eax, %xmm0 +; SSE41-32-NEXT: shrl $8, %eax +; SSE41-32-NEXT: pinsrb $7, %eax, %xmm0 +; SSE41-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: pinsrb $8, %eax, %xmm0 +; SSE41-32-NEXT: shrl $8, %eax +; SSE41-32-NEXT: pinsrb $9, %eax, %xmm0 +; SSE41-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: pinsrb $10, %eax, %xmm0 +; SSE41-32-NEXT: shrl $8, %eax +; SSE41-32-NEXT: pinsrb $11, %eax, %xmm0 +; SSE41-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: pinsrb $12, %eax, %xmm0 +; SSE41-32-NEXT: shrl $8, %eax +; SSE41-32-NEXT: pinsrb $13, %eax, %xmm0 +; SSE41-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE41-32-NEXT: pinsrb $14, %eax, %xmm0 +; SSE41-32-NEXT: shrl $8, %eax +; SSE41-32-NEXT: pinsrb $15, %eax, %xmm0 +; SSE41-32-NEXT: retl +; +; SSE41-64-LABEL: test_buildvector_v8i16_split_v16i8: +; SSE41-64: # %bb.0: +; SSE41-64-NEXT: movd %edi, %xmm0 +; SSE41-64-NEXT: shrl $8, %edi +; SSE41-64-NEXT: pinsrb $1, %edi, %xmm0 +; SSE41-64-NEXT: pinsrb $2, %esi, %xmm0 +; SSE41-64-NEXT: shrl $8, %esi +; SSE41-64-NEXT: pinsrb $3, %esi, %xmm0 +; SSE41-64-NEXT: pinsrb $4, %edx, %xmm0 +; SSE41-64-NEXT: shrl $8, %edx +; SSE41-64-NEXT: pinsrb $5, %edx, %xmm0 +; SSE41-64-NEXT: pinsrb $6, %ecx, %xmm0 +; SSE41-64-NEXT: shrl $8, %ecx +; SSE41-64-NEXT: pinsrb $7, %ecx, %xmm0 +; SSE41-64-NEXT: pinsrb 
$8, %r8d, %xmm0 +; SSE41-64-NEXT: shrl $8, %r8d +; SSE41-64-NEXT: pinsrb $9, %r8d, %xmm0 +; SSE41-64-NEXT: pinsrb $10, %r9d, %xmm0 +; SSE41-64-NEXT: shrl $8, %r9d +; SSE41-64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-64-NEXT: pinsrb $11, %r9d, %xmm0 +; SSE41-64-NEXT: pinsrb $12, %eax, %xmm0 +; SSE41-64-NEXT: shrl $8, %eax +; SSE41-64-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; SSE41-64-NEXT: pinsrb $13, %eax, %xmm0 +; SSE41-64-NEXT: pinsrb $14, %ecx, %xmm0 +; SSE41-64-NEXT: shrl $8, %ecx +; SSE41-64-NEXT: pinsrb $15, %ecx, %xmm0 +; SSE41-64-NEXT: retq +; +; AVX-32-LABEL: test_buildvector_v8i16_split_v16i8: +; AVX-32: # %bb.0: +; AVX-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vmovd %eax, %xmm0 +; AVX-32-NEXT: shrl $8, %eax +; AVX-32-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $8, %eax +; AVX-32-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $8, %eax +; AVX-32-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $8, %eax +; AVX-32-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $8, %eax +; AVX-32-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $8, %eax +; AVX-32-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $8, %eax +; AVX-32-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: shrl $8, %eax +; AVX-32-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: test_buildvector_v8i16_split_v16i8: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vmovd %edi, %xmm0 +; AVX-64-NEXT: shrl $8, %edi +; AVX-64-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $2, %esi, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $8, %esi +; AVX-64-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $4, %edx, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $8, %edx +; AVX-64-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $8, %ecx +; AVX-64-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $8, %r8d, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $8, %r8d +; AVX-64-NEXT: vpinsrb $9, %r8d, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $10, %r9d, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $8, %r9d +; AVX-64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX-64-NEXT: vpinsrb $11, %r9d, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $8, %eax +; AVX-64-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX-64-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; AVX-64-NEXT: shrl $8, %ecx +; AVX-64-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0 +; AVX-64-NEXT: retq + %a0.lo = trunc i16 %a0 to i8 + %a1.lo = trunc i16 %a1 to i8 + %a2.lo = trunc i16 %a2 to i8 + %a3.lo = trunc i16 %a3 to i8 + %a4.lo = trunc i16 %a4 to i8 + %a5.lo = trunc i16 %a5 to i8 + %a6.lo = trunc i16 %a6 to i8 + %a7.lo = trunc i16 %a7 to i8 + %a0.shr = lshr i16 %a0, 8 + %a1.shr = lshr i16 %a1, 8 + %a2.shr = lshr i16 %a2, 8 + %a3.shr = lshr i16 %a3, 8 + %a4.shr = 
lshr i16 %a4, 8 + %a5.shr = lshr i16 %a5, 8 + %a6.shr = lshr i16 %a6, 8 + %a7.shr = lshr i16 %a7, 8 + %a0.hi = trunc i16 %a0.shr to i8 + %a1.hi = trunc i16 %a1.shr to i8 + %a2.hi = trunc i16 %a2.shr to i8 + %a3.hi = trunc i16 %a3.shr to i8 + %a4.hi = trunc i16 %a4.shr to i8 + %a5.hi = trunc i16 %a5.shr to i8 + %a6.hi = trunc i16 %a6.shr to i8 + %a7.hi = trunc i16 %a7.shr to i8 + %v0 = insertelement <16 x i8> poison, i8 %a0.lo, i64 0 + %v1 = insertelement <16 x i8> %v0, i8 %a0.hi, i64 1 + %v2 = insertelement <16 x i8> %v1, i8 %a1.lo, i64 2 + %v3 = insertelement <16 x i8> %v2, i8 %a1.hi, i64 3 + %v4 = insertelement <16 x i8> %v3, i8 %a2.lo, i64 4 + %v5 = insertelement <16 x i8> %v4, i8 %a2.hi, i64 5 + %v6 = insertelement <16 x i8> %v5, i8 %a3.lo, i64 6 + %v7 = insertelement <16 x i8> %v6, i8 %a3.hi, i64 7 + %v8 = insertelement <16 x i8> %v7, i8 %a4.lo, i64 8 + %v9 = insertelement <16 x i8> %v8, i8 %a4.hi, i64 9 + %v10 = insertelement <16 x i8> %v9, i8 %a5.lo, i64 10 + %v11 = insertelement <16 x i8> %v10, i8 %a5.hi, i64 11 + %v12 = insertelement <16 x i8> %v11, i8 %a6.lo, i64 12 + %v13 = insertelement <16 x i8> %v12, i8 %a6.hi, i64 13 + %v14 = insertelement <16 x i8> %v13, i8 %a7.lo, i64 14 + %v15 = insertelement <16 x i8> %v14, i8 %a7.hi, i64 15 + ret <16 x i8> %v15 +} + ; build vectors of repeated elements define <4 x float> @test_buildvector_4f32_2_var(float %a0, float %a1) { diff --git a/llvm/test/CodeGen/X86/regalloc-fp.ll b/llvm/test/CodeGen/X86/regalloc-fp.ll new file mode 100644 index 0000000000000..e89e5ab1d6b59 --- /dev/null +++ b/llvm/test/CodeGen/X86/regalloc-fp.ll @@ -0,0 +1,775 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; Context: +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s +define i32 @check_none() "frame-pointer"="none" { +; CHECK-LABEL: check_none: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 56 +; CHECK-NEXT: .cfi_offset %rbx, -56 +; CHECK-NEXT: .cfi_offset %r12, -48 +; CHECK-NEXT: .cfi_offset %r13, -40 +; CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movl $0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $2, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $3, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $4, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $5, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $6, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $7, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $9, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $16, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $17, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $18, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $19, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $20, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r8d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r9d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r10d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r11d +; 
CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebp +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r14d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r15d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r12d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r13d +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r8d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r9d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r10d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r11d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ebx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ebp, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r14d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r15d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r12d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r13d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +entry: + %reg0 = alloca i32, align 4 + %reg1 = alloca i32, align 4 + %reg2 = alloca i32, align 4 + %reg3 = alloca i32, align 4 + %reg4 = alloca i32, align 4 + %reg5 = alloca i32, align 4 + %reg6 = alloca i32, align 4 + %reg7 = alloca i32, align 4 + %reg8 = alloca i32, align 4 + %reg9 = alloca i32, align 4 + %reg10 = alloca i32, align 4 + %reg11 = alloca i32, align 4 + %reg12 = alloca i32, align 4 + %reg13 = alloca i32, align 4 + %reg14 = alloca i32, align 4 + store volatile i32 0, ptr %reg0, align 4 + store volatile i32 1, ptr %reg1, align 4 + store volatile i32 2, ptr %reg2, align 4 + store volatile i32 3, ptr %reg3, align 4 + store volatile i32 4, ptr %reg4, align 4 + store volatile i32 5, ptr %reg5, align 4 + store volatile i32 6, ptr %reg6, align 4 + store volatile i32 7, ptr %reg7, align 4 + store volatile i32 8, ptr %reg8, align 4 + store volatile i32 9, ptr %reg9, align 4 + store volatile i32 16, ptr %reg10, align 4 + store volatile i32 17, ptr %reg11, align 4 + store volatile i32 18, ptr %reg12, align 4 + store volatile i32 19, ptr %reg13, align 4 + store volatile i32 20, ptr %reg14, align 4 + %0 = load volatile i32, ptr %reg0, align 4 + %1 = load volatile i32, ptr %reg1, align 4 + %2 = load volatile i32, ptr %reg2, align 4 + %3 = load volatile i32, ptr %reg3, align 4 + %4 = load volatile i32, ptr %reg4, align 4 + %5 = load volatile i32, ptr %reg5, align 4 + %6 = load volatile i32, ptr %reg6, align 4 + %7 = load volatile i32, ptr %reg7, align 4 + %8 = load volatile i32, ptr %reg8, align 4 + %9 = load volatile i32, ptr %reg9, align 4 + %10 = load volatile i32, ptr %reg10, align 4 + %11 = load volatile i32, ptr %reg11, align 4 + %12 = load volatile i32, ptr %reg12, align 4 + %13 = load volatile i32, ptr %reg13, align 4 + %14 = load volatile i32, ptr %reg14, align 4 + %15 = call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "nop", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13, 
i32 %14) #1 + %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 0 + %asmresult1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 1 + %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 2 + %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 3 + %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 4 + %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 5 + %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 6 + %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 7 + %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 8 + %asmresult9 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 9 + %asmresult10 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 10 + %asmresult11 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 11 + %asmresult12 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 12 + %asmresult13 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 13 + %asmresult14 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 14 + store volatile i32 %asmresult, ptr %reg0, align 4 + store volatile i32 %asmresult1, ptr %reg1, align 4 + store volatile i32 %asmresult2, ptr %reg2, align 4 + store volatile i32 %asmresult3, ptr %reg3, align 4 + store volatile i32 %asmresult4, ptr %reg4, align 4 + store volatile i32 %asmresult5, ptr %reg5, align 4 + store volatile i32 %asmresult6, ptr %reg6, align 4 + store volatile i32 %asmresult7, ptr %reg7, align 4 + store volatile i32 %asmresult8, ptr %reg8, align 4 + store volatile i32 %asmresult9, ptr %reg9, align 4 + store volatile i32 %asmresult10, ptr %reg10, align 4 + store volatile i32 %asmresult11, ptr %reg11, align 4 + store volatile i32 %asmresult12, ptr %reg12, align 4 + store volatile i32 %asmresult13, ptr %reg13, align 4 + store volatile i32 %asmresult14, ptr %reg14, align 4 + ret i32 0 +} + +define i32 @test_non_leaf_no_reserve() "frame-pointer"="non-leaf-no-reserve" { +; CHECK-LABEL: test_non_leaf_no_reserve: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 56 +; CHECK-NEXT: .cfi_offset %rbx, -56 +; CHECK-NEXT: .cfi_offset %r12, -48 +; CHECK-NEXT: .cfi_offset %r13, -40 +; CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movl $0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $2, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $3, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $4, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $5, 
-{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $6, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $7, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $9, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $16, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $17, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $18, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $19, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $20, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r8d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r9d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r10d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r11d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebp +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r14d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r15d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r12d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r13d +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r8d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r9d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r10d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r11d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ebx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ebp, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r14d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r15d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r12d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r13d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +entry: + %reg0 = alloca i32, align 4 + %reg1 = alloca i32, align 4 + %reg2 = alloca i32, align 4 + %reg3 = alloca i32, align 4 + %reg4 = alloca i32, align 4 + %reg5 = alloca i32, align 4 + %reg6 = alloca i32, align 4 + %reg7 = alloca i32, align 4 + %reg8 = alloca i32, align 4 + %reg9 = alloca i32, align 4 + %reg10 = alloca i32, align 4 + %reg11 = alloca i32, align 4 + %reg12 = alloca i32, align 4 + %reg13 = alloca i32, align 4 + %reg14 = alloca i32, align 4 + store volatile i32 0, ptr %reg0, align 4 + store volatile i32 1, ptr %reg1, align 4 + store volatile i32 2, ptr %reg2, align 4 + store volatile i32 3, ptr %reg3, align 4 + store volatile i32 4, ptr %reg4, align 4 + store volatile i32 5, ptr %reg5, align 4 + store volatile i32 6, ptr %reg6, align 4 + store volatile i32 7, ptr %reg7, align 4 + store volatile i32 8, ptr %reg8, align 4 + store volatile i32 9, ptr %reg9, align 4 + store volatile i32 16, ptr %reg10, align 4 + store volatile i32 17, ptr %reg11, align 4 + store volatile i32 18, ptr %reg12, align 4 + store volatile i32 19, ptr %reg13, align 4 + store volatile i32 20, ptr %reg14, align 4 + %0 = load volatile i32, ptr %reg0, align 4 + %1 = load volatile i32, ptr %reg1, align 4 + %2 = load volatile i32, ptr %reg2, align 4 + %3 = load volatile i32, ptr %reg3, align 4 + %4 = load volatile i32, ptr 
%reg4, align 4 + %5 = load volatile i32, ptr %reg5, align 4 + %6 = load volatile i32, ptr %reg6, align 4 + %7 = load volatile i32, ptr %reg7, align 4 + %8 = load volatile i32, ptr %reg8, align 4 + %9 = load volatile i32, ptr %reg9, align 4 + %10 = load volatile i32, ptr %reg10, align 4 + %11 = load volatile i32, ptr %reg11, align 4 + %12 = load volatile i32, ptr %reg12, align 4 + %13 = load volatile i32, ptr %reg13, align 4 + %14 = load volatile i32, ptr %reg14, align 4 + %15 = call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "nop", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13, i32 %14) #1 + %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 0 + %asmresult1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 1 + %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 2 + %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 3 + %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 4 + %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 5 + %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 6 + %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 7 + %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 8 + %asmresult9 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 9 + %asmresult10 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 10 + %asmresult11 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 11 + %asmresult12 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 12 + %asmresult13 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 13 + %asmresult14 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 14 + store volatile i32 %asmresult, ptr %reg0, align 4 + store volatile i32 %asmresult1, ptr %reg1, align 4 + store volatile i32 %asmresult2, ptr %reg2, align 4 + store volatile i32 %asmresult3, ptr %reg3, align 4 + store volatile i32 %asmresult4, ptr %reg4, align 4 + store volatile i32 %asmresult5, ptr %reg5, align 4 + store volatile i32 %asmresult6, ptr %reg6, align 4 + store volatile i32 %asmresult7, ptr %reg7, align 4 + store volatile i32 %asmresult8, ptr %reg8, align 4 + store volatile i32 %asmresult9, ptr %reg9, align 4 + store volatile i32 %asmresult10, ptr %reg10, align 4 + store volatile i32 %asmresult11, ptr %reg11, align 4 + store volatile i32 %asmresult12, ptr %reg12, align 4 + store volatile i32 %asmresult13, ptr %reg13, align 4 + store volatile i32 %asmresult14, ptr %reg14, align 4 + ret i32 0 +} + +define i32 @test_non_leaf() "frame-pointer"="non-leaf" { +; CHECK-LABEL: test_non_leaf: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: 
pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: .cfi_offset %rbx, -48 +; CHECK-NEXT: .cfi_offset %r12, -40 +; CHECK-NEXT: .cfi_offset %r13, -32 +; CHECK-NEXT: .cfi_offset %r14, -24 +; CHECK-NEXT: .cfi_offset %r15, -16 +; CHECK-NEXT: movl $0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $2, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $3, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $4, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $5, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $6, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $7, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $9, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $16, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $17, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $18, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $19, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r8d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r9d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r10d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r11d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r14d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r15d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r12d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r13d +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r8d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r9d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r10d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r11d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ebx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r14d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r15d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r12d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r13d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +entry: + %reg0 = alloca i32, align 4 + %reg1 = alloca i32, align 4 + %reg2 = alloca i32, align 4 + %reg3 = alloca i32, align 4 + %reg4 = alloca i32, align 4 + %reg5 = alloca i32, align 4 + %reg6 = alloca i32, align 4 + %reg7 = alloca i32, align 4 + %reg8 = alloca i32, align 4 + %reg9 = alloca i32, align 4 + %reg10 = alloca i32, align 4 + %reg11 = alloca i32, align 4 + %reg12 = alloca i32, align 4 + %reg13 = alloca i32, align 4 + store volatile i32 0, ptr %reg0, align 4 + store volatile i32 1, ptr %reg1, align 4 + store volatile i32 2, ptr %reg2, align 4 + store volatile i32 3, ptr %reg3, align 4 + store volatile i32 4, ptr %reg4, align 4 + store volatile i32 5, ptr %reg5, align 4 + store volatile i32 6, ptr %reg6, align 4 + store volatile i32 7, ptr %reg7, align 4 + store volatile i32 8, ptr %reg8, align 4 + store volatile i32 9, ptr %reg9, align 4 + 
store volatile i32 16, ptr %reg10, align 4 + store volatile i32 17, ptr %reg11, align 4 + store volatile i32 18, ptr %reg12, align 4 + store volatile i32 19, ptr %reg13, align 4 + %0 = load volatile i32, ptr %reg0, align 4 + %1 = load volatile i32, ptr %reg1, align 4 + %2 = load volatile i32, ptr %reg2, align 4 + %3 = load volatile i32, ptr %reg3, align 4 + %4 = load volatile i32, ptr %reg4, align 4 + %5 = load volatile i32, ptr %reg5, align 4 + %6 = load volatile i32, ptr %reg6, align 4 + %7 = load volatile i32, ptr %reg7, align 4 + %8 = load volatile i32, ptr %reg8, align 4 + %9 = load volatile i32, ptr %reg9, align 4 + %10 = load volatile i32, ptr %reg10, align 4 + %11 = load volatile i32, ptr %reg11, align 4 + %12 = load volatile i32, ptr %reg12, align 4 + %13 = load volatile i32, ptr %reg13, align 4 + %14 = call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "nop", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13) #1 + %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 0 + %asmresult1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 1 + %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 2 + %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 3 + %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 4 + %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 5 + %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 6 + %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 7 + %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 8 + %asmresult9 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 9 + %asmresult10 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 10 + %asmresult11 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 11 + %asmresult12 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 12 + %asmresult13 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 13 + store volatile i32 %asmresult, ptr %reg0, align 4 + store volatile i32 %asmresult1, ptr %reg1, align 4 + store volatile i32 %asmresult2, ptr %reg2, align 4 + store volatile i32 %asmresult3, ptr %reg3, align 4 + store volatile i32 %asmresult4, ptr %reg4, align 4 + store volatile i32 %asmresult5, ptr %reg5, align 4 + store volatile i32 %asmresult6, ptr %reg6, align 4 + store volatile i32 %asmresult7, ptr %reg7, align 4 + store volatile i32 %asmresult8, ptr %reg8, align 4 + store volatile i32 %asmresult9, ptr %reg9, align 4 + store volatile i32 %asmresult10, ptr %reg10, align 4 + store volatile i32 %asmresult11, ptr %reg11, align 4 + store volatile i32 %asmresult12, ptr %reg12, align 4 + store volatile i32 %asmresult13, ptr %reg13, align 4 + ret i32 0 +} + +define i32 @test_reserved() "frame-pointer"="reserved" { +; CHECK-LABEL: test_reserved: +; CHECK: # %bb.0: # %entry +; 
CHECK-NEXT: pushq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: .cfi_offset %rbx, -48 +; CHECK-NEXT: .cfi_offset %r12, -40 +; CHECK-NEXT: .cfi_offset %r13, -32 +; CHECK-NEXT: .cfi_offset %r14, -24 +; CHECK-NEXT: .cfi_offset %r15, -16 +; CHECK-NEXT: movl $0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $2, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $3, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $4, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $5, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $6, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $7, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $9, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $16, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $17, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $18, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $19, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r8d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r9d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r10d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r11d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r14d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r15d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r12d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r13d +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r8d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r9d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r10d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r11d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ebx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r14d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r15d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r12d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r13d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +entry: + %reg0 = alloca i32, align 4 + %reg1 = alloca i32, align 4 + %reg2 = alloca i32, align 4 + %reg3 = alloca i32, align 4 + %reg4 = alloca i32, align 4 + %reg5 = alloca i32, align 4 + %reg6 = alloca i32, align 4 + %reg7 = alloca i32, align 4 + %reg8 = alloca i32, align 4 + %reg9 = alloca i32, align 4 + %reg10 = alloca i32, align 4 + %reg11 = alloca i32, align 4 + %reg12 = alloca i32, align 4 + %reg13 = alloca i32, align 4 + store volatile i32 0, ptr %reg0, align 4 + store volatile i32 1, ptr %reg1, align 4 + store volatile i32 2, ptr %reg2, align 4 + store volatile i32 3, ptr %reg3, align 4 + store volatile i32 4, ptr %reg4, align 4 + store volatile i32 5, ptr %reg5, align 4 + store volatile i32 6, ptr %reg6, align 4 + store volatile i32 7, ptr %reg7, align 4 + store 
volatile i32 8, ptr %reg8, align 4 + store volatile i32 9, ptr %reg9, align 4 + store volatile i32 16, ptr %reg10, align 4 + store volatile i32 17, ptr %reg11, align 4 + store volatile i32 18, ptr %reg12, align 4 + store volatile i32 19, ptr %reg13, align 4 + %0 = load volatile i32, ptr %reg0, align 4 + %1 = load volatile i32, ptr %reg1, align 4 + %2 = load volatile i32, ptr %reg2, align 4 + %3 = load volatile i32, ptr %reg3, align 4 + %4 = load volatile i32, ptr %reg4, align 4 + %5 = load volatile i32, ptr %reg5, align 4 + %6 = load volatile i32, ptr %reg6, align 4 + %7 = load volatile i32, ptr %reg7, align 4 + %8 = load volatile i32, ptr %reg8, align 4 + %9 = load volatile i32, ptr %reg9, align 4 + %10 = load volatile i32, ptr %reg10, align 4 + %11 = load volatile i32, ptr %reg11, align 4 + %12 = load volatile i32, ptr %reg12, align 4 + %13 = load volatile i32, ptr %reg13, align 4 + %14 = call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "nop", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13) #1 + %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 0 + %asmresult1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 1 + %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 2 + %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 3 + %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 4 + %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 5 + %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 6 + %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 7 + %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 8 + %asmresult9 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 9 + %asmresult10 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 10 + %asmresult11 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 11 + %asmresult12 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 12 + %asmresult13 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 13 + store volatile i32 %asmresult, ptr %reg0, align 4 + store volatile i32 %asmresult1, ptr %reg1, align 4 + store volatile i32 %asmresult2, ptr %reg2, align 4 + store volatile i32 %asmresult3, ptr %reg3, align 4 + store volatile i32 %asmresult4, ptr %reg4, align 4 + store volatile i32 %asmresult5, ptr %reg5, align 4 + store volatile i32 %asmresult6, ptr %reg6, align 4 + store volatile i32 %asmresult7, ptr %reg7, align 4 + store volatile i32 %asmresult8, ptr %reg8, align 4 + store volatile i32 %asmresult9, ptr %reg9, align 4 + store volatile i32 %asmresult10, ptr %reg10, align 4 + store volatile i32 %asmresult11, ptr %reg11, align 4 + store volatile i32 %asmresult12, ptr %reg12, align 4 + store volatile i32 %asmresult13, ptr %reg13, align 4 + ret i32 0 +} + +define i32 @test_all() 
"frame-pointer"="all" { +; CHECK-LABEL: test_all: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_offset %rbx, -56 +; CHECK-NEXT: .cfi_offset %r12, -48 +; CHECK-NEXT: .cfi_offset %r13, -40 +; CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: movl $0, -96(%rbp) +; CHECK-NEXT: movl $1, -92(%rbp) +; CHECK-NEXT: movl $2, -88(%rbp) +; CHECK-NEXT: movl $3, -84(%rbp) +; CHECK-NEXT: movl $4, -80(%rbp) +; CHECK-NEXT: movl $5, -76(%rbp) +; CHECK-NEXT: movl $6, -72(%rbp) +; CHECK-NEXT: movl $7, -68(%rbp) +; CHECK-NEXT: movl $8, -64(%rbp) +; CHECK-NEXT: movl $9, -60(%rbp) +; CHECK-NEXT: movl $16, -56(%rbp) +; CHECK-NEXT: movl $17, -52(%rbp) +; CHECK-NEXT: movl $18, -48(%rbp) +; CHECK-NEXT: movl $19, -44(%rbp) +; CHECK-NEXT: movl -96(%rbp), %eax +; CHECK-NEXT: movl -92(%rbp), %ecx +; CHECK-NEXT: movl -88(%rbp), %edx +; CHECK-NEXT: movl -84(%rbp), %esi +; CHECK-NEXT: movl -80(%rbp), %edi +; CHECK-NEXT: movl -76(%rbp), %r8d +; CHECK-NEXT: movl -72(%rbp), %r9d +; CHECK-NEXT: movl -68(%rbp), %r10d +; CHECK-NEXT: movl -64(%rbp), %r11d +; CHECK-NEXT: movl -60(%rbp), %ebx +; CHECK-NEXT: movl -56(%rbp), %r14d +; CHECK-NEXT: movl -52(%rbp), %r15d +; CHECK-NEXT: movl -48(%rbp), %r12d +; CHECK-NEXT: movl -44(%rbp), %r13d +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl %eax, -96(%rbp) +; CHECK-NEXT: movl %ecx, -92(%rbp) +; CHECK-NEXT: movl %edx, -88(%rbp) +; CHECK-NEXT: movl %esi, -84(%rbp) +; CHECK-NEXT: movl %edi, -80(%rbp) +; CHECK-NEXT: movl %r8d, -76(%rbp) +; CHECK-NEXT: movl %r9d, -72(%rbp) +; CHECK-NEXT: movl %r10d, -68(%rbp) +; CHECK-NEXT: movl %r11d, -64(%rbp) +; CHECK-NEXT: movl %ebx, -60(%rbp) +; CHECK-NEXT: movl %r14d, -56(%rbp) +; CHECK-NEXT: movl %r15d, -52(%rbp) +; CHECK-NEXT: movl %r12d, -48(%rbp) +; CHECK-NEXT: movl %r13d, -44(%rbp) +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq +entry: + %reg0 = alloca i32, align 4 + %reg1 = alloca i32, align 4 + %reg2 = alloca i32, align 4 + %reg3 = alloca i32, align 4 + %reg4 = alloca i32, align 4 + %reg5 = alloca i32, align 4 + %reg6 = alloca i32, align 4 + %reg7 = alloca i32, align 4 + %reg8 = alloca i32, align 4 + %reg9 = alloca i32, align 4 + %reg10 = alloca i32, align 4 + %reg11 = alloca i32, align 4 + %reg12 = alloca i32, align 4 + %reg13 = alloca i32, align 4 + store volatile i32 0, ptr %reg0, align 4 + store volatile i32 1, ptr %reg1, align 4 + store volatile i32 2, ptr %reg2, align 4 + store volatile i32 3, ptr %reg3, align 4 + store volatile i32 4, ptr %reg4, align 4 + store volatile i32 5, ptr %reg5, align 4 + store volatile i32 6, ptr %reg6, align 4 + store volatile i32 7, ptr %reg7, align 4 + store volatile i32 8, ptr %reg8, align 4 + store volatile i32 9, ptr %reg9, align 4 + store volatile i32 16, ptr %reg10, align 4 + store volatile i32 17, ptr %reg11, align 4 + store volatile i32 18, ptr %reg12, align 4 + store volatile i32 19, ptr %reg13, align 4 + %0 = load volatile i32, ptr %reg0, align 4 + %1 = load volatile i32, ptr %reg1, align 4 + %2 = load volatile i32, ptr %reg2, align 4 + %3 = load 
volatile i32, ptr %reg3, align 4 + %4 = load volatile i32, ptr %reg4, align 4 + %5 = load volatile i32, ptr %reg5, align 4 + %6 = load volatile i32, ptr %reg6, align 4 + %7 = load volatile i32, ptr %reg7, align 4 + %8 = load volatile i32, ptr %reg8, align 4 + %9 = load volatile i32, ptr %reg9, align 4 + %10 = load volatile i32, ptr %reg10, align 4 + %11 = load volatile i32, ptr %reg11, align 4 + %12 = load volatile i32, ptr %reg12, align 4 + %13 = load volatile i32, ptr %reg13, align 4 + %14 = call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "nop", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13) #1 + %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 0 + %asmresult1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 1 + %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 2 + %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 3 + %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 4 + %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 5 + %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 6 + %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 7 + %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 8 + %asmresult9 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 9 + %asmresult10 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 10 + %asmresult11 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 11 + %asmresult12 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 12 + %asmresult13 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 13 + store volatile i32 %asmresult, ptr %reg0, align 4 + store volatile i32 %asmresult1, ptr %reg1, align 4 + store volatile i32 %asmresult2, ptr %reg2, align 4 + store volatile i32 %asmresult3, ptr %reg3, align 4 + store volatile i32 %asmresult4, ptr %reg4, align 4 + store volatile i32 %asmresult5, ptr %reg5, align 4 + store volatile i32 %asmresult6, ptr %reg6, align 4 + store volatile i32 %asmresult7, ptr %reg7, align 4 + store volatile i32 %asmresult8, ptr %reg8, align 4 + store volatile i32 %asmresult9, ptr %reg9, align 4 + store volatile i32 %asmresult10, ptr %reg10, align 4 + store volatile i32 %asmresult11, ptr %reg11, align 4 + store volatile i32 %asmresult12, ptr %reg12, align 4 + store volatile i32 %asmresult13, ptr %reg13, align 4 + ret i32 0 +} diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_call_pic.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_call_pic.s index e38a2f39b7c22..743181655a5cc 100644 --- a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_call_pic.s +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_call_pic.s @@ -9,7 +9,6 @@ # RUN: -abs extern_out_of_range32=0x7fff00000000 \ # RUN: -abs 
extern_in_range32=0xffe00000 \ # RUN: -check %s %t/elf_pic_reloc.o -# XFAIL: * .text .section .text.main @@ -62,8 +61,9 @@ test_call_extern_plt: # Check PLT stub relocation for lgrl(Delta32dbl). The lgrl immediate is a 32-bit field holding the halved stub-to-GOT delta, so the expected value is masked to 32 bits to match the 4-byte read. # # jitlink-check: *{4}(stub_addr(elf_pic_reloc.o, extern_out_of_range32) + 2) = \ -# jitlink-check: (got_addr(elf_pic_reloc.o, extern_out_of_range32) - \ -# jitlink-check: stub_addr(elf_pic_reloc.o, extern_out_of_range32)) >> 1 +# jitlink-check: ((got_addr(elf_pic_reloc.o, extern_out_of_range32) - \ +# jitlink-check: stub_addr(elf_pic_reloc.o, extern_out_of_range32)) >> 1) \ +# jitlink-check: & 0xffffffff .globl test_call_extern_plt_stub .p2align 4 .type test_call_extern_plt_stub,@function diff --git a/llvm/test/MC/MachO/invalid-section-index.s b/llvm/test/MC/MachO/invalid-section-index.s new file mode 100644 index 0000000000000..80e1c64d9cbd5 --- /dev/null +++ b/llvm/test/MC/MachO/invalid-section-index.s @@ -0,0 +1,1572 @@ +/// Test that when there are more than 255 sections, an error is emitted stating that there are too many sections. +// REQUIRES: stability + +// RUN: not llvm-mc -filetype=obj -triple arm64-apple-macos %s -o - 2>&1 | FileCheck %s --check-prefix=MACHOERROR + +// MACHOERROR: error: Too many sections! +// MACHOERROR-NEXT: error: Invalid section index! +// MACHOERROR-NEXT: error: Invalid section index! + + .section __TEXT,__text,regular,pure_instructions + .globl _main ; -- Begin function main + .p2align 2 +_main: ; @main + .cfi_startproc +; %bb.0: ; %entry + sub sp, sp, #16 + .cfi_def_cfa_offset 16 + mov w0, #0 ; =0x0 + str wzr, [sp, #12] + add sp, sp, #16 + ret + .cfi_endproc + ; -- End function + .section seg,sect0 + .globl _var0 ; @var0 + .p2align 2, 0x0 +_var0: + .long 0 ; 0x0 + + .section seg,sect1 + .globl _var1 ; @var1 + .p2align 2, 0x0 +_var1: + .long 1 ; 0x1 + + .section seg,sect2 + .globl _var2 ; @var2 + .p2align 2, 0x0 +_var2: + .long 2 ; 0x2 + + .section seg,sect3 + .globl _var3 ; @var3 + .p2align 2, 0x0 +_var3: + .long 3 ; 0x3 + + .section seg,sect4 + .globl _var4 ; @var4 + .p2align 2, 0x0 +_var4: + .long 4 ; 0x4 + + .section seg,sect5 + .globl _var5 ; @var5 + .p2align 2, 0x0 +_var5: + .long 5 ; 0x5 + + .section seg,sect6 + .globl _var6 ; @var6 + .p2align 2, 0x0 +_var6: + .long 6 ; 0x6 + + .section seg,sect7 + .globl _var7 ; @var7 + .p2align 2, 0x0 +_var7: + .long 7 ; 0x7 + + .section seg,sect8 + .globl _var8 ; @var8 + .p2align 2, 0x0 +_var8: + .long 8 ; 0x8 + + .section seg,sect9 + .globl _var9 ; @var9 + .p2align 2, 0x0 +_var9: + .long 9 ; 0x9 + + .section seg,sect10 + .globl _var10 ; @var10 + .p2align 2, 0x0 +_var10: + .long 10 ; 0xa + + .section seg,sect11 + .globl _var11 ; @var11 + .p2align 2, 0x0 +_var11: + .long 11 ; 0xb + + .section seg,sect12 + .globl _var12 ; @var12 + .p2align 2, 0x0 +_var12: + .long 12 ; 0xc + + .section seg,sect13 + .globl _var13 ; @var13 + .p2align 2, 0x0 +_var13: + .long 13 ; 0xd + + .section seg,sect14 + .globl _var14 ; @var14 + .p2align 2, 0x0 +_var14: + .long 14 ; 0xe + + .section seg,sect15 + .globl _var15 ; @var15 + .p2align 2, 0x0 +_var15: + .long 15 ; 0xf + + .section seg,sect16 + .globl _var16 ; @var16 + .p2align 2, 0x0 +_var16: + .long 16 ; 0x10 + + .section seg,sect17 + .globl _var17 ; @var17 + .p2align 2, 0x0 +_var17: + .long 17 ; 0x11 + + .section seg,sect18 + .globl _var18 ; @var18 + .p2align 2, 0x0 +_var18: + .long 18 ; 0x12 + + .section seg,sect19 + .globl _var19 ; @var19 + .p2align 2, 0x0 +_var19: + .long 19 ; 0x13 + + .section seg,sect20 + .globl _var20 ; @var20 + .p2align 2, 0x0 +_var20: + .long 20 ; 0x14 + + .section seg,sect21 + 
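+/// Background (a sketch for context only, not exercised by the test): the
+/// 255-section ceiling comes from the Mach-O symbol table format, where each
+/// nlist entry records its owning section as a single byte, with ordinals
+/// 1..MAX_SECT (255) and NO_SECT (0) meaning "no section". Simplified from
+/// <mach-o/nlist.h>:
+///
+///   struct nlist_64 {
+///     uint32_t n_strx;  // offset of the symbol name in the string table
+///     uint8_t  n_type;  // type flags
+///     uint8_t  n_sect;  // section ordinal; one byte caps it at 255
+///     uint16_t n_desc;  // additional description bits
+///     uint64_t n_value; // symbol value, typically an address
+///   };
+///
+/// Sections past the 255th therefore cannot be referenced from a symbol,
+/// which is what the "Too many sections!" / "Invalid section index!" checks
+/// above exercise.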
.globl _var21 ; @var21 + .p2align 2, 0x0 +_var21: + .long 21 ; 0x15 + + .section seg,sect22 + .globl _var22 ; @var22 + .p2align 2, 0x0 +_var22: + .long 22 ; 0x16 + + .section seg,sect23 + .globl _var23 ; @var23 + .p2align 2, 0x0 +_var23: + .long 23 ; 0x17 + + .section seg,sect24 + .globl _var24 ; @var24 + .p2align 2, 0x0 +_var24: + .long 24 ; 0x18 + + .section seg,sect25 + .globl _var25 ; @var25 + .p2align 2, 0x0 +_var25: + .long 25 ; 0x19 + + .section seg,sect26 + .globl _var26 ; @var26 + .p2align 2, 0x0 +_var26: + .long 26 ; 0x1a + + .section seg,sect27 + .globl _var27 ; @var27 + .p2align 2, 0x0 +_var27: + .long 27 ; 0x1b + + .section seg,sect28 + .globl _var28 ; @var28 + .p2align 2, 0x0 +_var28: + .long 28 ; 0x1c + + .section seg,sect29 + .globl _var29 ; @var29 + .p2align 2, 0x0 +_var29: + .long 29 ; 0x1d + + .section seg,sect30 + .globl _var30 ; @var30 + .p2align 2, 0x0 +_var30: + .long 30 ; 0x1e + + .section seg,sect31 + .globl _var31 ; @var31 + .p2align 2, 0x0 +_var31: + .long 31 ; 0x1f + + .section seg,sect32 + .globl _var32 ; @var32 + .p2align 2, 0x0 +_var32: + .long 32 ; 0x20 + + .section seg,sect33 + .globl _var33 ; @var33 + .p2align 2, 0x0 +_var33: + .long 33 ; 0x21 + + .section seg,sect34 + .globl _var34 ; @var34 + .p2align 2, 0x0 +_var34: + .long 34 ; 0x22 + + .section seg,sect35 + .globl _var35 ; @var35 + .p2align 2, 0x0 +_var35: + .long 35 ; 0x23 + + .section seg,sect36 + .globl _var36 ; @var36 + .p2align 2, 0x0 +_var36: + .long 36 ; 0x24 + + .section seg,sect37 + .globl _var37 ; @var37 + .p2align 2, 0x0 +_var37: + .long 37 ; 0x25 + + .section seg,sect38 + .globl _var38 ; @var38 + .p2align 2, 0x0 +_var38: + .long 38 ; 0x26 + + .section seg,sect39 + .globl _var39 ; @var39 + .p2align 2, 0x0 +_var39: + .long 39 ; 0x27 + + .section seg,sect40 + .globl _var40 ; @var40 + .p2align 2, 0x0 +_var40: + .long 40 ; 0x28 + + .section seg,sect41 + .globl _var41 ; @var41 + .p2align 2, 0x0 +_var41: + .long 41 ; 0x29 + + .section seg,sect42 + .globl _var42 ; @var42 + .p2align 2, 0x0 +_var42: + .long 42 ; 0x2a + + .section seg,sect43 + .globl _var43 ; @var43 + .p2align 2, 0x0 +_var43: + .long 43 ; 0x2b + + .section seg,sect44 + .globl _var44 ; @var44 + .p2align 2, 0x0 +_var44: + .long 44 ; 0x2c + + .section seg,sect45 + .globl _var45 ; @var45 + .p2align 2, 0x0 +_var45: + .long 45 ; 0x2d + + .section seg,sect46 + .globl _var46 ; @var46 + .p2align 2, 0x0 +_var46: + .long 46 ; 0x2e + + .section seg,sect47 + .globl _var47 ; @var47 + .p2align 2, 0x0 +_var47: + .long 47 ; 0x2f + + .section seg,sect48 + .globl _var48 ; @var48 + .p2align 2, 0x0 +_var48: + .long 48 ; 0x30 + + .section seg,sect49 + .globl _var49 ; @var49 + .p2align 2, 0x0 +_var49: + .long 49 ; 0x31 + + .section seg,sect50 + .globl _var50 ; @var50 + .p2align 2, 0x0 +_var50: + .long 50 ; 0x32 + + .section seg,sect51 + .globl _var51 ; @var51 + .p2align 2, 0x0 +_var51: + .long 51 ; 0x33 + + .section seg,sect52 + .globl _var52 ; @var52 + .p2align 2, 0x0 +_var52: + .long 52 ; 0x34 + + .section seg,sect53 + .globl _var53 ; @var53 + .p2align 2, 0x0 +_var53: + .long 53 ; 0x35 + + .section seg,sect54 + .globl _var54 ; @var54 + .p2align 2, 0x0 +_var54: + .long 54 ; 0x36 + + .section seg,sect55 + .globl _var55 ; @var55 + .p2align 2, 0x0 +_var55: + .long 55 ; 0x37 + + .section seg,sect56 + .globl _var56 ; @var56 + .p2align 2, 0x0 +_var56: + .long 56 ; 0x38 + + .section seg,sect57 + .globl _var57 ; @var57 + .p2align 2, 0x0 +_var57: + .long 57 ; 0x39 + + .section seg,sect58 + .globl _var58 ; @var58 + .p2align 2, 0x0 +_var58: + .long 58 ; 0x3a + + 
.section seg,sect59 + .globl _var59 ; @var59 + .p2align 2, 0x0 +_var59: + .long 59 ; 0x3b + + .section seg,sect60 + .globl _var60 ; @var60 + .p2align 2, 0x0 +_var60: + .long 60 ; 0x3c + + .section seg,sect61 + .globl _var61 ; @var61 + .p2align 2, 0x0 +_var61: + .long 61 ; 0x3d + + .section seg,sect62 + .globl _var62 ; @var62 + .p2align 2, 0x0 +_var62: + .long 62 ; 0x3e + + .section seg,sect63 + .globl _var63 ; @var63 + .p2align 2, 0x0 +_var63: + .long 63 ; 0x3f + + .section seg,sect64 + .globl _var64 ; @var64 + .p2align 2, 0x0 +_var64: + .long 64 ; 0x40 + + .section seg,sect65 + .globl _var65 ; @var65 + .p2align 2, 0x0 +_var65: + .long 65 ; 0x41 + + .section seg,sect66 + .globl _var66 ; @var66 + .p2align 2, 0x0 +_var66: + .long 66 ; 0x42 + + .section seg,sect67 + .globl _var67 ; @var67 + .p2align 2, 0x0 +_var67: + .long 67 ; 0x43 + + .section seg,sect68 + .globl _var68 ; @var68 + .p2align 2, 0x0 +_var68: + .long 68 ; 0x44 + + .section seg,sect69 + .globl _var69 ; @var69 + .p2align 2, 0x0 +_var69: + .long 69 ; 0x45 + + .section seg,sect70 + .globl _var70 ; @var70 + .p2align 2, 0x0 +_var70: + .long 70 ; 0x46 + + .section seg,sect71 + .globl _var71 ; @var71 + .p2align 2, 0x0 +_var71: + .long 71 ; 0x47 + + .section seg,sect72 + .globl _var72 ; @var72 + .p2align 2, 0x0 +_var72: + .long 72 ; 0x48 + + .section seg,sect73 + .globl _var73 ; @var73 + .p2align 2, 0x0 +_var73: + .long 73 ; 0x49 + + .section seg,sect74 + .globl _var74 ; @var74 + .p2align 2, 0x0 +_var74: + .long 74 ; 0x4a + + .section seg,sect75 + .globl _var75 ; @var75 + .p2align 2, 0x0 +_var75: + .long 75 ; 0x4b + + .section seg,sect76 + .globl _var76 ; @var76 + .p2align 2, 0x0 +_var76: + .long 76 ; 0x4c + + .section seg,sect77 + .globl _var77 ; @var77 + .p2align 2, 0x0 +_var77: + .long 77 ; 0x4d + + .section seg,sect78 + .globl _var78 ; @var78 + .p2align 2, 0x0 +_var78: + .long 78 ; 0x4e + + .section seg,sect79 + .globl _var79 ; @var79 + .p2align 2, 0x0 +_var79: + .long 79 ; 0x4f + + .section seg,sect80 + .globl _var80 ; @var80 + .p2align 2, 0x0 +_var80: + .long 80 ; 0x50 + + .section seg,sect81 + .globl _var81 ; @var81 + .p2align 2, 0x0 +_var81: + .long 81 ; 0x51 + + .section seg,sect82 + .globl _var82 ; @var82 + .p2align 2, 0x0 +_var82: + .long 82 ; 0x52 + + .section seg,sect83 + .globl _var83 ; @var83 + .p2align 2, 0x0 +_var83: + .long 83 ; 0x53 + + .section seg,sect84 + .globl _var84 ; @var84 + .p2align 2, 0x0 +_var84: + .long 84 ; 0x54 + + .section seg,sect85 + .globl _var85 ; @var85 + .p2align 2, 0x0 +_var85: + .long 85 ; 0x55 + + .section seg,sect86 + .globl _var86 ; @var86 + .p2align 2, 0x0 +_var86: + .long 86 ; 0x56 + + .section seg,sect87 + .globl _var87 ; @var87 + .p2align 2, 0x0 +_var87: + .long 87 ; 0x57 + + .section seg,sect88 + .globl _var88 ; @var88 + .p2align 2, 0x0 +_var88: + .long 88 ; 0x58 + + .section seg,sect89 + .globl _var89 ; @var89 + .p2align 2, 0x0 +_var89: + .long 89 ; 0x59 + + .section seg,sect90 + .globl _var90 ; @var90 + .p2align 2, 0x0 +_var90: + .long 90 ; 0x5a + + .section seg,sect91 + .globl _var91 ; @var91 + .p2align 2, 0x0 +_var91: + .long 91 ; 0x5b + + .section seg,sect92 + .globl _var92 ; @var92 + .p2align 2, 0x0 +_var92: + .long 92 ; 0x5c + + .section seg,sect93 + .globl _var93 ; @var93 + .p2align 2, 0x0 +_var93: + .long 93 ; 0x5d + + .section seg,sect94 + .globl _var94 ; @var94 + .p2align 2, 0x0 +_var94: + .long 94 ; 0x5e + + .section seg,sect95 + .globl _var95 ; @var95 + .p2align 2, 0x0 +_var95: + .long 95 ; 0x5f + + .section seg,sect96 + .globl _var96 ; @var96 + .p2align 2, 0x0 +_var96: + 
.long 96 ; 0x60 + + .section seg,sect97 + .globl _var97 ; @var97 + .p2align 2, 0x0 +_var97: + .long 97 ; 0x61 + + .section seg,sect98 + .globl _var98 ; @var98 + .p2align 2, 0x0 +_var98: + .long 98 ; 0x62 + + .section seg,sect99 + .globl _var99 ; @var99 + .p2align 2, 0x0 +_var99: + .long 99 ; 0x63 + + .section seg,sect100 + .globl _var100 ; @var100 + .p2align 2, 0x0 +_var100: + .long 100 ; 0x64 + + .section seg,sect101 + .globl _var101 ; @var101 + .p2align 2, 0x0 +_var101: + .long 101 ; 0x65 + + .section seg,sect102 + .globl _var102 ; @var102 + .p2align 2, 0x0 +_var102: + .long 102 ; 0x66 + + .section seg,sect103 + .globl _var103 ; @var103 + .p2align 2, 0x0 +_var103: + .long 103 ; 0x67 + + .section seg,sect104 + .globl _var104 ; @var104 + .p2align 2, 0x0 +_var104: + .long 104 ; 0x68 + + .section seg,sect105 + .globl _var105 ; @var105 + .p2align 2, 0x0 +_var105: + .long 105 ; 0x69 + + .section seg,sect106 + .globl _var106 ; @var106 + .p2align 2, 0x0 +_var106: + .long 106 ; 0x6a + + .section seg,sect107 + .globl _var107 ; @var107 + .p2align 2, 0x0 +_var107: + .long 107 ; 0x6b + + .section seg,sect108 + .globl _var108 ; @var108 + .p2align 2, 0x0 +_var108: + .long 108 ; 0x6c + + .section seg,sect109 + .globl _var109 ; @var109 + .p2align 2, 0x0 +_var109: + .long 109 ; 0x6d + + .section seg,sect110 + .globl _var110 ; @var110 + .p2align 2, 0x0 +_var110: + .long 110 ; 0x6e + + .section seg,sect111 + .globl _var111 ; @var111 + .p2align 2, 0x0 +_var111: + .long 111 ; 0x6f + + .section seg,sect112 + .globl _var112 ; @var112 + .p2align 2, 0x0 +_var112: + .long 112 ; 0x70 + + .section seg,sect113 + .globl _var113 ; @var113 + .p2align 2, 0x0 +_var113: + .long 113 ; 0x71 + + .section seg,sect114 + .globl _var114 ; @var114 + .p2align 2, 0x0 +_var114: + .long 114 ; 0x72 + + .section seg,sect115 + .globl _var115 ; @var115 + .p2align 2, 0x0 +_var115: + .long 115 ; 0x73 + + .section seg,sect116 + .globl _var116 ; @var116 + .p2align 2, 0x0 +_var116: + .long 116 ; 0x74 + + .section seg,sect117 + .globl _var117 ; @var117 + .p2align 2, 0x0 +_var117: + .long 117 ; 0x75 + + .section seg,sect118 + .globl _var118 ; @var118 + .p2align 2, 0x0 +_var118: + .long 118 ; 0x76 + + .section seg,sect119 + .globl _var119 ; @var119 + .p2align 2, 0x0 +_var119: + .long 119 ; 0x77 + + .section seg,sect120 + .globl _var120 ; @var120 + .p2align 2, 0x0 +_var120: + .long 120 ; 0x78 + + .section seg,sect121 + .globl _var121 ; @var121 + .p2align 2, 0x0 +_var121: + .long 121 ; 0x79 + + .section seg,sect122 + .globl _var122 ; @var122 + .p2align 2, 0x0 +_var122: + .long 122 ; 0x7a + + .section seg,sect123 + .globl _var123 ; @var123 + .p2align 2, 0x0 +_var123: + .long 123 ; 0x7b + + .section seg,sect124 + .globl _var124 ; @var124 + .p2align 2, 0x0 +_var124: + .long 124 ; 0x7c + + .section seg,sect125 + .globl _var125 ; @var125 + .p2align 2, 0x0 +_var125: + .long 125 ; 0x7d + + .section seg,sect126 + .globl _var126 ; @var126 + .p2align 2, 0x0 +_var126: + .long 126 ; 0x7e + + .section seg,sect127 + .globl _var127 ; @var127 + .p2align 2, 0x0 +_var127: + .long 127 ; 0x7f + + .section seg,sect128 + .globl _var128 ; @var128 + .p2align 2, 0x0 +_var128: + .long 128 ; 0x80 + + .section seg,sect129 + .globl _var129 ; @var129 + .p2align 2, 0x0 +_var129: + .long 129 ; 0x81 + + .section seg,sect130 + .globl _var130 ; @var130 + .p2align 2, 0x0 +_var130: + .long 130 ; 0x82 + + .section seg,sect131 + .globl _var131 ; @var131 + .p2align 2, 0x0 +_var131: + .long 131 ; 0x83 + + .section seg,sect132 + .globl _var132 ; @var132 + .p2align 2, 0x0 +_var132: + .long 
132 ; 0x84 + + .section seg,sect133 + .globl _var133 ; @var133 + .p2align 2, 0x0 +_var133: + .long 133 ; 0x85 + + .section seg,sect134 + .globl _var134 ; @var134 + .p2align 2, 0x0 +_var134: + .long 134 ; 0x86 + + .section seg,sect135 + .globl _var135 ; @var135 + .p2align 2, 0x0 +_var135: + .long 135 ; 0x87 + + .section seg,sect136 + .globl _var136 ; @var136 + .p2align 2, 0x0 +_var136: + .long 136 ; 0x88 + + .section seg,sect137 + .globl _var137 ; @var137 + .p2align 2, 0x0 +_var137: + .long 137 ; 0x89 + + .section seg,sect138 + .globl _var138 ; @var138 + .p2align 2, 0x0 +_var138: + .long 138 ; 0x8a + + .section seg,sect139 + .globl _var139 ; @var139 + .p2align 2, 0x0 +_var139: + .long 139 ; 0x8b + + .section seg,sect140 + .globl _var140 ; @var140 + .p2align 2, 0x0 +_var140: + .long 140 ; 0x8c + + .section seg,sect141 + .globl _var141 ; @var141 + .p2align 2, 0x0 +_var141: + .long 141 ; 0x8d + + .section seg,sect142 + .globl _var142 ; @var142 + .p2align 2, 0x0 +_var142: + .long 142 ; 0x8e + + .section seg,sect143 + .globl _var143 ; @var143 + .p2align 2, 0x0 +_var143: + .long 143 ; 0x8f + + .section seg,sect144 + .globl _var144 ; @var144 + .p2align 2, 0x0 +_var144: + .long 144 ; 0x90 + + .section seg,sect145 + .globl _var145 ; @var145 + .p2align 2, 0x0 +_var145: + .long 145 ; 0x91 + + .section seg,sect146 + .globl _var146 ; @var146 + .p2align 2, 0x0 +_var146: + .long 146 ; 0x92 + + .section seg,sect147 + .globl _var147 ; @var147 + .p2align 2, 0x0 +_var147: + .long 147 ; 0x93 + + .section seg,sect148 + .globl _var148 ; @var148 + .p2align 2, 0x0 +_var148: + .long 148 ; 0x94 + + .section seg,sect149 + .globl _var149 ; @var149 + .p2align 2, 0x0 +_var149: + .long 149 ; 0x95 + + .section seg,sect150 + .globl _var150 ; @var150 + .p2align 2, 0x0 +_var150: + .long 150 ; 0x96 + + .section seg,sect151 + .globl _var151 ; @var151 + .p2align 2, 0x0 +_var151: + .long 151 ; 0x97 + + .section seg,sect152 + .globl _var152 ; @var152 + .p2align 2, 0x0 +_var152: + .long 152 ; 0x98 + + .section seg,sect153 + .globl _var153 ; @var153 + .p2align 2, 0x0 +_var153: + .long 153 ; 0x99 + + .section seg,sect154 + .globl _var154 ; @var154 + .p2align 2, 0x0 +_var154: + .long 154 ; 0x9a + + .section seg,sect155 + .globl _var155 ; @var155 + .p2align 2, 0x0 +_var155: + .long 155 ; 0x9b + + .section seg,sect156 + .globl _var156 ; @var156 + .p2align 2, 0x0 +_var156: + .long 156 ; 0x9c + + .section seg,sect157 + .globl _var157 ; @var157 + .p2align 2, 0x0 +_var157: + .long 157 ; 0x9d + + .section seg,sect158 + .globl _var158 ; @var158 + .p2align 2, 0x0 +_var158: + .long 158 ; 0x9e + + .section seg,sect159 + .globl _var159 ; @var159 + .p2align 2, 0x0 +_var159: + .long 159 ; 0x9f + + .section seg,sect160 + .globl _var160 ; @var160 + .p2align 2, 0x0 +_var160: + .long 160 ; 0xa0 + + .section seg,sect161 + .globl _var161 ; @var161 + .p2align 2, 0x0 +_var161: + .long 161 ; 0xa1 + + .section seg,sect162 + .globl _var162 ; @var162 + .p2align 2, 0x0 +_var162: + .long 162 ; 0xa2 + + .section seg,sect163 + .globl _var163 ; @var163 + .p2align 2, 0x0 +_var163: + .long 163 ; 0xa3 + + .section seg,sect164 + .globl _var164 ; @var164 + .p2align 2, 0x0 +_var164: + .long 164 ; 0xa4 + + .section seg,sect165 + .globl _var165 ; @var165 + .p2align 2, 0x0 +_var165: + .long 165 ; 0xa5 + + .section seg,sect166 + .globl _var166 ; @var166 + .p2align 2, 0x0 +_var166: + .long 166 ; 0xa6 + + .section seg,sect167 + .globl _var167 ; @var167 + .p2align 2, 0x0 +_var167: + .long 167 ; 0xa7 + + .section seg,sect168 + .globl _var168 ; @var168 + .p2align 2, 0x0 
+_var168: + .long 168 ; 0xa8 + + .section seg,sect169 + .globl _var169 ; @var169 + .p2align 2, 0x0 +_var169: + .long 169 ; 0xa9 + + .section seg,sect170 + .globl _var170 ; @var170 + .p2align 2, 0x0 +_var170: + .long 170 ; 0xaa + + .section seg,sect171 + .globl _var171 ; @var171 + .p2align 2, 0x0 +_var171: + .long 171 ; 0xab + + .section seg,sect172 + .globl _var172 ; @var172 + .p2align 2, 0x0 +_var172: + .long 172 ; 0xac + + .section seg,sect173 + .globl _var173 ; @var173 + .p2align 2, 0x0 +_var173: + .long 173 ; 0xad + + .section seg,sect174 + .globl _var174 ; @var174 + .p2align 2, 0x0 +_var174: + .long 174 ; 0xae + + .section seg,sect175 + .globl _var175 ; @var175 + .p2align 2, 0x0 +_var175: + .long 175 ; 0xaf + + .section seg,sect176 + .globl _var176 ; @var176 + .p2align 2, 0x0 +_var176: + .long 176 ; 0xb0 + + .section seg,sect177 + .globl _var177 ; @var177 + .p2align 2, 0x0 +_var177: + .long 177 ; 0xb1 + + .section seg,sect178 + .globl _var178 ; @var178 + .p2align 2, 0x0 +_var178: + .long 178 ; 0xb2 + + .section seg,sect179 + .globl _var179 ; @var179 + .p2align 2, 0x0 +_var179: + .long 179 ; 0xb3 + + .section seg,sect180 + .globl _var180 ; @var180 + .p2align 2, 0x0 +_var180: + .long 180 ; 0xb4 + + .section seg,sect181 + .globl _var181 ; @var181 + .p2align 2, 0x0 +_var181: + .long 181 ; 0xb5 + + .section seg,sect182 + .globl _var182 ; @var182 + .p2align 2, 0x0 +_var182: + .long 182 ; 0xb6 + + .section seg,sect183 + .globl _var183 ; @var183 + .p2align 2, 0x0 +_var183: + .long 183 ; 0xb7 + + .section seg,sect184 + .globl _var184 ; @var184 + .p2align 2, 0x0 +_var184: + .long 184 ; 0xb8 + + .section seg,sect185 + .globl _var185 ; @var185 + .p2align 2, 0x0 +_var185: + .long 185 ; 0xb9 + + .section seg,sect186 + .globl _var186 ; @var186 + .p2align 2, 0x0 +_var186: + .long 186 ; 0xba + + .section seg,sect187 + .globl _var187 ; @var187 + .p2align 2, 0x0 +_var187: + .long 187 ; 0xbb + + .section seg,sect188 + .globl _var188 ; @var188 + .p2align 2, 0x0 +_var188: + .long 188 ; 0xbc + + .section seg,sect189 + .globl _var189 ; @var189 + .p2align 2, 0x0 +_var189: + .long 189 ; 0xbd + + .section seg,sect190 + .globl _var190 ; @var190 + .p2align 2, 0x0 +_var190: + .long 190 ; 0xbe + + .section seg,sect191 + .globl _var191 ; @var191 + .p2align 2, 0x0 +_var191: + .long 191 ; 0xbf + + .section seg,sect192 + .globl _var192 ; @var192 + .p2align 2, 0x0 +_var192: + .long 192 ; 0xc0 + + .section seg,sect193 + .globl _var193 ; @var193 + .p2align 2, 0x0 +_var193: + .long 193 ; 0xc1 + + .section seg,sect194 + .globl _var194 ; @var194 + .p2align 2, 0x0 +_var194: + .long 194 ; 0xc2 + + .section seg,sect195 + .globl _var195 ; @var195 + .p2align 2, 0x0 +_var195: + .long 195 ; 0xc3 + + .section seg,sect196 + .globl _var196 ; @var196 + .p2align 2, 0x0 +_var196: + .long 196 ; 0xc4 + + .section seg,sect197 + .globl _var197 ; @var197 + .p2align 2, 0x0 +_var197: + .long 197 ; 0xc5 + + .section seg,sect198 + .globl _var198 ; @var198 + .p2align 2, 0x0 +_var198: + .long 198 ; 0xc6 + + .section seg,sect199 + .globl _var199 ; @var199 + .p2align 2, 0x0 +_var199: + .long 199 ; 0xc7 + + .section seg,sect200 + .globl _var200 ; @var200 + .p2align 2, 0x0 +_var200: + .long 200 ; 0xc8 + + .section seg,sect201 + .globl _var201 ; @var201 + .p2align 2, 0x0 +_var201: + .long 201 ; 0xc9 + + .section seg,sect202 + .globl _var202 ; @var202 + .p2align 2, 0x0 +_var202: + .long 202 ; 0xca + + .section seg,sect203 + .globl _var203 ; @var203 + .p2align 2, 0x0 +_var203: + .long 203 ; 0xcb + + .section seg,sect204 + .globl _var204 ; @var204 + 
.p2align 2, 0x0 +_var204: + .long 204 ; 0xcc + + .section seg,sect205 + .globl _var205 ; @var205 + .p2align 2, 0x0 +_var205: + .long 205 ; 0xcd + + .section seg,sect206 + .globl _var206 ; @var206 + .p2align 2, 0x0 +_var206: + .long 206 ; 0xce + + .section seg,sect207 + .globl _var207 ; @var207 + .p2align 2, 0x0 +_var207: + .long 207 ; 0xcf + + .section seg,sect208 + .globl _var208 ; @var208 + .p2align 2, 0x0 +_var208: + .long 208 ; 0xd0 + + .section seg,sect209 + .globl _var209 ; @var209 + .p2align 2, 0x0 +_var209: + .long 209 ; 0xd1 + + .section seg,sect210 + .globl _var210 ; @var210 + .p2align 2, 0x0 +_var210: + .long 210 ; 0xd2 + + .section seg,sect211 + .globl _var211 ; @var211 + .p2align 2, 0x0 +_var211: + .long 211 ; 0xd3 + + .section seg,sect212 + .globl _var212 ; @var212 + .p2align 2, 0x0 +_var212: + .long 212 ; 0xd4 + + .section seg,sect213 + .globl _var213 ; @var213 + .p2align 2, 0x0 +_var213: + .long 213 ; 0xd5 + + .section seg,sect214 + .globl _var214 ; @var214 + .p2align 2, 0x0 +_var214: + .long 214 ; 0xd6 + + .section seg,sect215 + .globl _var215 ; @var215 + .p2align 2, 0x0 +_var215: + .long 215 ; 0xd7 + + .section seg,sect216 + .globl _var216 ; @var216 + .p2align 2, 0x0 +_var216: + .long 216 ; 0xd8 + + .section seg,sect217 + .globl _var217 ; @var217 + .p2align 2, 0x0 +_var217: + .long 217 ; 0xd9 + + .section seg,sect218 + .globl _var218 ; @var218 + .p2align 2, 0x0 +_var218: + .long 218 ; 0xda + + .section seg,sect219 + .globl _var219 ; @var219 + .p2align 2, 0x0 +_var219: + .long 219 ; 0xdb + + .section seg,sect220 + .globl _var220 ; @var220 + .p2align 2, 0x0 +_var220: + .long 220 ; 0xdc + + .section seg,sect221 + .globl _var221 ; @var221 + .p2align 2, 0x0 +_var221: + .long 221 ; 0xdd + + .section seg,sect222 + .globl _var222 ; @var222 + .p2align 2, 0x0 +_var222: + .long 222 ; 0xde + + .section seg,sect223 + .globl _var223 ; @var223 + .p2align 2, 0x0 +_var223: + .long 223 ; 0xdf + + .section seg,sect224 + .globl _var224 ; @var224 + .p2align 2, 0x0 +_var224: + .long 224 ; 0xe0 + + .section seg,sect225 + .globl _var225 ; @var225 + .p2align 2, 0x0 +_var225: + .long 225 ; 0xe1 + + .section seg,sect226 + .globl _var226 ; @var226 + .p2align 2, 0x0 +_var226: + .long 226 ; 0xe2 + + .section seg,sect227 + .globl _var227 ; @var227 + .p2align 2, 0x0 +_var227: + .long 227 ; 0xe3 + + .section seg,sect228 + .globl _var228 ; @var228 + .p2align 2, 0x0 +_var228: + .long 228 ; 0xe4 + + .section seg,sect229 + .globl _var229 ; @var229 + .p2align 2, 0x0 +_var229: + .long 229 ; 0xe5 + + .section seg,sect230 + .globl _var230 ; @var230 + .p2align 2, 0x0 +_var230: + .long 230 ; 0xe6 + + .section seg,sect231 + .globl _var231 ; @var231 + .p2align 2, 0x0 +_var231: + .long 231 ; 0xe7 + + .section seg,sect232 + .globl _var232 ; @var232 + .p2align 2, 0x0 +_var232: + .long 232 ; 0xe8 + + .section seg,sect233 + .globl _var233 ; @var233 + .p2align 2, 0x0 +_var233: + .long 233 ; 0xe9 + + .section seg,sect234 + .globl _var234 ; @var234 + .p2align 2, 0x0 +_var234: + .long 234 ; 0xea + + .section seg,sect235 + .globl _var235 ; @var235 + .p2align 2, 0x0 +_var235: + .long 235 ; 0xeb + + .section seg,sect236 + .globl _var236 ; @var236 + .p2align 2, 0x0 +_var236: + .long 236 ; 0xec + + .section seg,sect237 + .globl _var237 ; @var237 + .p2align 2, 0x0 +_var237: + .long 237 ; 0xed + + .section seg,sect238 + .globl _var238 ; @var238 + .p2align 2, 0x0 +_var238: + .long 238 ; 0xee + + .section seg,sect239 + .globl _var239 ; @var239 + .p2align 2, 0x0 +_var239: + .long 239 ; 0xef + + .section seg,sect240 + .globl _var240 ; 
@var240
+ .p2align 2, 0x0
+_var240:
+ .long 240 ; 0xf0
+
+ .section seg,sect241
+ .globl _var241 ; @var241
+ .p2align 2, 0x0
+_var241:
+ .long 241 ; 0xf1
+
+ .section seg,sect242
+ .globl _var242 ; @var242
+ .p2align 2, 0x0
+_var242:
+ .long 242 ; 0xf2
+
+ .section seg,sect243
+ .globl _var243 ; @var243
+ .p2align 2, 0x0
+_var243:
+ .long 243 ; 0xf3
+
+ .section seg,sect244
+ .globl _var244 ; @var244
+ .p2align 2, 0x0
+_var244:
+ .long 244 ; 0xf4
+
+ .section seg,sect245
+ .globl _var245 ; @var245
+ .p2align 2, 0x0
+_var245:
+ .long 245 ; 0xf5
+
+ .section seg,sect246
+ .globl _var246 ; @var246
+ .p2align 2, 0x0
+_var246:
+ .long 246 ; 0xf6
+
+ .section seg,sect247
+ .globl _var247 ; @var247
+ .p2align 2, 0x0
+_var247:
+ .long 247 ; 0xf7
+
+ .section seg,sect248
+ .globl _var248 ; @var248
+ .p2align 2, 0x0
+_var248:
+ .long 248 ; 0xf8
+
+ .section seg,sect249
+ .globl _var249 ; @var249
+ .p2align 2, 0x0
+_var249:
+ .long 249 ; 0xf9
+
+ .section seg,sect250
+ .globl _var250 ; @var250
+ .p2align 2, 0x0
+_var250:
+ .long 250 ; 0xfa
+
+ .section seg,sect251
+ .globl _var251 ; @var251
+ .p2align 2, 0x0
+_var251:
+ .long 251 ; 0xfb
+
+ .section seg,sect252
+ .globl _var252 ; @var252
+ .p2align 2, 0x0
+_var252:
+ .long 252 ; 0xfc
+
+ .section seg,sect253
+ .globl _var253 ; @var253
+ .p2align 2, 0x0
+_var253:
+ .long 253 ; 0xfd
+
+ .section seg,sect254
+ .globl _var254 ; @var254
+ .p2align 2, 0x0
+_var254:
+ .long 254 ; 0xfe
+
+ .section seg,sect255
+ .globl _var255 ; @var255
+ .p2align 2, 0x0
+_var255:
+ .long 255 ; 0xff
+
+ .section seg,sect256
+ .globl _var256 ; @var256
+ .p2align 2, 0x0
+_var256:
+ .long 256 ; 0x100
+
+ .section seg,sect257
+ .globl _var257 ; @var257
+ .p2align 2, 0x0
+_var257:
+ .long 257 ; 0x101
+
+.subsections_via_symbols
diff --git a/llvm/test/Transforms/InstCombine/modular-format.ll b/llvm/test/Transforms/InstCombine/modular-format.ll
new file mode 100644
index 0000000000000..d9b7b6f056f59
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/modular-format.ll
@@ -0,0 +1,105 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; Test that the modular format string library call simplifier works correctly.
+;
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@.str.int = constant [3 x i8] c"%d\00"
+@.str.float = constant [3 x i8] c"%f\00"
+@.str.multi = constant [6 x i8] c"%f %d\00"
+@.str.noargs = constant [1 x i8] c"\00"
+
+;; No aspects are specified, so no transformation occurs.
+define void @test_basic(i32 %arg) {
+; CHECK-LABEL: @test_basic(
+; CHECK-NEXT: call void (ptr, ...) @basic(ptr nonnull @.str.int, i32 [[ARG:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void (ptr, ...) @basic(ptr @.str.int, i32 %arg)
+ ret void
+}
+
+declare void @basic(ptr, ...) #0
+
+;; The "float" aspect is present and needed, so no transformation occurs.
+define void @test_float_present(double %arg) {
+; CHECK-LABEL: @test_float_present(
+; CHECK-NEXT: call void (ptr, ...) @float_present(ptr nonnull @.str.float, double [[ARG:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void (ptr, ...) @float_present(ptr @.str.float, double %arg)
+ ret void
+}
+
+declare void @float_present(ptr, ...) #1
+
+;; The "float" aspect is present but not needed, so the call is transformed.
+define void @test_float_absent(i32 %arg) {
+; CHECK-LABEL: @test_float_absent(
+; CHECK-NEXT: call void (ptr, ...) @float_present_mod(ptr nonnull @.str.int, i32 [[ARG:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void (ptr, ...) @float_absent(ptr @.str.int, i32 %arg)
+ ret void
+}
+
+declare void @float_absent(ptr, ...) #1
+
+;; Unknown aspects are always considered needed, so no transformation occurs.
+define void @test_unknown_aspects(i32 %arg) {
+; CHECK-LABEL: @test_unknown_aspects(
+; CHECK-NEXT: call void (ptr, ...) @unknown_aspects(ptr nonnull @.str.int, i32 [[ARG:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void (ptr, ...) @unknown_aspects(ptr @.str.int, i32 %arg)
+ ret void
+}
+
+declare void @unknown_aspects(ptr, ...) #2
+
+;; The call has no arguments to check, so the "float" aspect is not needed and
+;; the call is transformed.
+define void @test_no_args_to_check() {
+; CHECK-LABEL: @test_no_args_to_check(
+; CHECK-NEXT: call void (ptr, ...) @float_present_mod(ptr nonnull @.str.noargs)
+; CHECK-NEXT: ret void
+;
+ call void (ptr, ...) @no_args_to_check(ptr @.str.noargs)
+ ret void
+}
+
+declare void @no_args_to_check(ptr, ...) #1
+
+;; The first argument index is not 2. The "float" aspect is needed, so no
+;; transformation occurs.
+define void @test_first_arg_idx(i32 %ignored, double %arg) {
+; CHECK-LABEL: @test_first_arg_idx(
+; CHECK-NEXT: call void (i32, ptr, ...) @first_arg_idx(i32 [[IGNORED:%.*]], ptr nonnull @.str.float, double [[ARG:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void (i32, ptr, ...) @first_arg_idx(i32 %ignored, ptr @.str.float, double %arg)
+ ret void
+}
+
+declare void @first_arg_idx(i32, ptr, ...) #3
+
+;; One aspect ("unknown") is needed, but one ("float") is not. The call is
+;; transformed, and a reference to the needed aspect is emitted.
+define void @test_partial_aspects(i32 %arg) {
+; CHECK-LABEL: @test_partial_aspects(
+; CHECK-NEXT: call void (ptr, ...) @multiple_aspects_mod(ptr nonnull @.str.int, i32 [[ARG:%.*]])
+; CHECK-NEXT: call void @llvm.reloc.none(metadata !"basic_impl_unknown")
+; CHECK-NEXT: ret void
+;
+ call void (ptr, ...) @partial_aspects(ptr @.str.int, i32 %arg)
+ ret void
+}
+
+declare void @partial_aspects(ptr, ...) #4
+
+attributes #0 = { "modular-format"="printf,1,2,basic_mod,basic_impl" }
+attributes #1 = { "modular-format"="printf,1,2,float_present_mod,basic_impl,float" }
+attributes #2 = { "modular-format"="printf,1,2,unknown_aspects_mod,basic_impl,unknown1,unknown2" }
+attributes #3 = { "modular-format"="printf,2,3,first_arg_idx_mod,basic_impl,float" }
+attributes #4 = { "modular-format"="printf,1,2,multiple_aspects_mod,basic_impl,float,unknown" }
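+
+;; A note for readers skimming this test (inferred from the cases above, not
+;; stated elsewhere in this patch): the attribute string encodes
+;;   "modular-format"="<kind>,<format-string-idx>,<first-arg-idx>,<modular-fn>,<impl-name>[,<aspect>...]"
+;; A call is retargeted to <modular-fn> once some listed aspect is provably
+;; not needed by its arguments, e.g. @float_absent(ptr @.str.int, i32 %x)
+;; becomes @float_present_mod(...) because "%d" needs no "float" support.
+;; Each listed aspect that is still needed is instead kept alive through an
+;; llvm.reloc.none reference to "<impl-name>_<aspect>", as
+;; @test_partial_aspects checks via !"basic_impl_unknown".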
diff --git a/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads-with-predicated-stores.ll b/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads-with-predicated-stores.ll
new file mode 100644
index 0000000000000..d447a39aafd93
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads-with-predicated-stores.ll
@@ -0,0 +1,761 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph:" --version 6
+; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck %s
+
+define void @test_stores_noalias_via_rt_checks_after_loads(ptr %dst, ptr %src, ptr %cond, i32 %n) {
+; CHECK-LABEL: define void @test_stores_noalias_via_rt_checks_after_loads(
+; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK: [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]]
+; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]]
+; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]]
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]]
+; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]]
+; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]]
+; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
+; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
+; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE17:.*]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META0:![0-9]+]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11)
+; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP7]], splat (i1 true)
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
+; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]]
+; CHECK: [[PRED_LOAD_IF]]:
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]]
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !alias.scope [[META3:![0-9]+]]
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i32 0
+; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]]
+; CHECK: [[PRED_LOAD_CONTINUE]]:
+; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP12]], %[[PRED_LOAD_IF]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
+; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_LOAD_IF6:.*]], label %[[PRED_LOAD_CONTINUE7:.*]]
+; CHECK: [[PRED_LOAD_IF6]]:
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]]
+; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4, !alias.scope [[META3]]
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP16]], i32 1
+; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE7]]
+; CHECK: [[PRED_LOAD_CONTINUE7]]:
+; CHECK-NEXT: [[TMP18:%.*]] = phi <2 x i32> [ [[TMP13]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], %[[PRED_LOAD_IF6]] ]
+; CHECK-NEXT: [[TMP19:%.*]] = sub <2 x i32> [[TMP18]], splat (i32 5)
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
+; CHECK-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK: [[PRED_STORE_IF]]:
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]]
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i32> [[TMP19]], i32 0
+; CHECK-NEXT: store i32 [[TMP22]], ptr [[TMP21]], align 4, !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]]
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]]
+; CHECK: [[PRED_STORE_CONTINUE]]:
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
+; CHECK-NEXT: br i1 [[TMP23]], label %[[PRED_STORE_IF8:.*]], label %[[PRED_STORE_CONTINUE9:.*]]
+; CHECK: [[PRED_STORE_IF8]]:
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]]
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i32> [[TMP19]], i32 1
+; CHECK-NEXT: store i32 [[TMP25]], ptr [[TMP24]], align 4, !alias.scope [[META5]], !noalias [[META7]]
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE9]]
+; CHECK: [[PRED_STORE_CONTINUE9]]:
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; CHECK-NEXT: br i1 [[TMP26]], label %[[PRED_LOAD_IF10:.*]], label %[[PRED_LOAD_CONTINUE11:.*]]
+; CHECK: [[PRED_LOAD_IF10]]:
+; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]]
+; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4, !alias.scope [[META3]]
+; CHECK-NEXT: [[TMP29:%.*]] = insertelement <2 x i32> poison, i32 [[TMP28]], i32 0
+; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE11]]
+; CHECK: [[PRED_LOAD_CONTINUE11]]:
+; CHECK-NEXT: [[TMP30:%.*]] = phi <2 x i32> [ poison, %[[PRED_STORE_CONTINUE9]] ], [ [[TMP29]], %[[PRED_LOAD_IF10]] ]
+; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; CHECK-NEXT: br i1 [[TMP31]], label %[[PRED_LOAD_IF12:.*]], label %[[PRED_LOAD_CONTINUE13:.*]]
+; CHECK: [[PRED_LOAD_IF12]]:
+; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]]
+; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4, !alias.scope [[META3]]
+; CHECK-NEXT: [[TMP34:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[TMP33]], i32 1
+; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE13]]
+; CHECK: [[PRED_LOAD_CONTINUE13]]:
+; CHECK-NEXT: [[TMP35:%.*]] = phi <2 x i32> [ [[TMP30]], %[[PRED_LOAD_CONTINUE11]] ], [ [[TMP34]], %[[PRED_LOAD_IF12]] ]
+; CHECK-NEXT: [[TMP36:%.*]] = add <2 x i32> [[TMP35]], splat (i32 10)
+; CHECK-NEXT: [[TMP37:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; CHECK-NEXT: br i1 [[TMP37]], label %[[PRED_STORE_IF14:.*]], label %[[PRED_STORE_CONTINUE15:.*]]
+; CHECK: [[PRED_STORE_IF14]]:
+; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]]
+; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i32> [[TMP36]], i32 0
+; CHECK-NEXT: store i32 [[TMP39]], ptr [[TMP38]], align 4, !alias.scope [[META5]], !noalias [[META7]]
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE15]]
+; CHECK: [[PRED_STORE_CONTINUE15]]:
+; CHECK-NEXT: [[TMP40:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; CHECK-NEXT: br i1 [[TMP40]], label %[[PRED_STORE_IF16:.*]], label %[[PRED_STORE_CONTINUE17]]
+; CHECK: [[PRED_STORE_IF16]]:
+; CHECK-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]]
+; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i32> [[TMP36]], i32 1
+; CHECK-NEXT: store i32 [[TMP42]], ptr [[TMP41]], align 4, !alias.scope [[META5]], !noalias [[META7]]
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE17]]
+; CHECK: [[PRED_STORE_CONTINUE17]]:
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP43]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv
+ %c = load i32, ptr %gep.cond, align 4
+ %c.0 = icmp ule i32 %c, 11
+ br i1 %c.0, label %then, label %else
+
+then:
+ %gep.src.then = getelementptr inbounds i32, ptr %src, i32 %iv
+ %l.src.then = load i32, ptr %gep.src.then, align 4
+ %add = add i32 %l.src.then, 10
+ %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv
+ store i32 %add, ptr %gep.dst, align 4
+ br label %loop.latch
+
+else:
+ %gep.src.else = getelementptr inbounds i32, ptr %src, i32 %iv
+ %l.src.else = load i32, ptr %gep.src.else, align 4
+ %sub = sub i32 %l.src.else, 5
+ %gep.dst.else = getelementptr inbounds i32, ptr %dst, i32 %iv
+ store i32 %sub, ptr %gep.dst.else, align 4
+ br label %loop.latch
+
+loop.latch:
+ %iv.next = add nuw nsw i32 %iv, 1
+ %ec = icmp eq i32 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @test_aliasing_store(ptr %dst, ptr %src, ptr %cond, i32 %n) {
+; CHECK-LABEL: define void @test_aliasing_store(
+; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK: [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]]
+; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]]
+; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]]
+; CHECK-NEXT: [[BOUND0:%.*]] =
icmp ult ptr [[SRC]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: [[BOUND06:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND17:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT8:%.*]] = and i1 [[BOUND06]], [[BOUND17]] +; CHECK-NEXT: [[CONFLICT_RDX9:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT8]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX9]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE21:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META12:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP7]], splat (i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: store i32 99, ptr [[TMP10]], align 4, !alias.scope [[META15:![0-9]+]], !noalias [[META17:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !alias.scope [[META15]], !noalias [[META17]] +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP12]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_LOAD_IF10:.*]], label %[[PRED_LOAD_CONTINUE11:.*]] +; CHECK: [[PRED_LOAD_IF10]]: +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: store i32 99, ptr [[TMP15]], align 4, !alias.scope [[META15]], !noalias [[META17]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4, !alias.scope [[META15]], !noalias [[META17]] +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP16]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_CONTINUE11]]: +; CHECK-NEXT: [[TMP18:%.*]] = phi <2 x i32> [ [[TMP13]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], %[[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[TMP19:%.*]] = sub <2 x i32> [[TMP18]], splat (i32 5) +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP22:%.*]] = 
extractelement <2 x i32> [[TMP19]], i32 0 +; CHECK-NEXT: store i32 [[TMP22]], ptr [[TMP21]], align 4, !alias.scope [[META19:![0-9]+]], !noalias [[META12]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP23]], label %[[PRED_STORE_IF12:.*]], label %[[PRED_STORE_CONTINUE13:.*]] +; CHECK: [[PRED_STORE_IF12]]: +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i32> [[TMP19]], i32 1 +; CHECK-NEXT: store i32 [[TMP25]], ptr [[TMP24]], align 4, !alias.scope [[META19]], !noalias [[META12]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE13]] +; CHECK: [[PRED_STORE_CONTINUE13]]: +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP26]], label %[[PRED_LOAD_IF14:.*]], label %[[PRED_LOAD_CONTINUE15:.*]] +; CHECK: [[PRED_LOAD_IF14]]: +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4, !alias.scope [[META15]], !noalias [[META17]] +; CHECK-NEXT: [[TMP29:%.*]] = insertelement <2 x i32> poison, i32 [[TMP28]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE15]] +; CHECK: [[PRED_LOAD_CONTINUE15]]: +; CHECK-NEXT: [[TMP30:%.*]] = phi <2 x i32> [ poison, %[[PRED_STORE_CONTINUE13]] ], [ [[TMP29]], %[[PRED_LOAD_IF14]] ] +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP31]], label %[[PRED_LOAD_IF16:.*]], label %[[PRED_LOAD_CONTINUE17:.*]] +; CHECK: [[PRED_LOAD_IF16]]: +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4, !alias.scope [[META15]], !noalias [[META17]] +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[TMP33]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE17]] +; CHECK: [[PRED_LOAD_CONTINUE17]]: +; CHECK-NEXT: [[TMP35:%.*]] = phi <2 x i32> [ [[TMP30]], %[[PRED_LOAD_CONTINUE15]] ], [ [[TMP34]], %[[PRED_LOAD_IF16]] ] +; CHECK-NEXT: [[TMP36:%.*]] = add <2 x i32> [[TMP35]], splat (i32 10) +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP37]], label %[[PRED_STORE_IF18:.*]], label %[[PRED_STORE_CONTINUE19:.*]] +; CHECK: [[PRED_STORE_IF18]]: +; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i32> [[TMP36]], i32 0 +; CHECK-NEXT: store i32 [[TMP39]], ptr [[TMP38]], align 4, !alias.scope [[META19]], !noalias [[META12]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE19]] +; CHECK: [[PRED_STORE_CONTINUE19]]: +; CHECK-NEXT: [[TMP40:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP40]], label %[[PRED_STORE_IF20:.*]], label %[[PRED_STORE_CONTINUE21]] +; CHECK: [[PRED_STORE_IF20]]: +; CHECK-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i32> [[TMP36]], i32 1 +; CHECK-NEXT: store i32 [[TMP42]], ptr [[TMP41]], align 4, !alias.scope [[META19]], !noalias [[META12]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE21]] +; CHECK: [[PRED_STORE_CONTINUE21]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP43]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop 
[[LOOP20:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %c = load i32, ptr %gep.cond, align 4 + %c.0 = icmp ule i32 %c, 11 + br i1 %c.0, label %then, label %else + +then: + %gep.src.then = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.then = load i32, ptr %gep.src.then, align 4 + %add = add i32 %l.src.then, 10 + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %add, ptr %gep.dst, align 4 + br label %loop.latch + +else: + %gep.src.else = getelementptr inbounds i32, ptr %src, i32 %iv + store i32 99, ptr %gep.src.else, align 4 + %l.src.else = load i32, ptr %gep.src.else, align 4 + %sub = sub i32 %l.src.else, 5 + %gep.dst.else = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %sub, ptr %gep.dst.else, align 4 + br label %loop.latch + +loop.latch: + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_noalias_store_via_runtime_checks(ptr %dst, ptr %dst.1, ptr %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @test_noalias_store_via_runtime_checks( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[DST_1:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST_1]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST_1]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND04:%.*]] = icmp ult ptr [[DST_1]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND15:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT6:%.*]] = and i1 [[BOUND04]], [[BOUND15]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT6]] +; CHECK-NEXT: [[BOUND07:%.*]] = icmp ult ptr [[DST_1]], [[SCEVGEP3]] +; CHECK-NEXT: [[BOUND18:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT9:%.*]] = and i1 [[BOUND07]], [[BOUND18]] +; CHECK-NEXT: [[CONFLICT_RDX10:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT9]] +; CHECK-NEXT: [[BOUND011:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND112:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT13:%.*]] = and i1 [[BOUND011]], [[BOUND112]] +; CHECK-NEXT: [[CONFLICT_RDX14:%.*]] = or i1 [[CONFLICT_RDX10]], [[FOUND_CONFLICT13]] +; CHECK-NEXT: [[BOUND015:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP3]] +; CHECK-NEXT: [[BOUND116:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT17:%.*]] = and i1 [[BOUND015]], 
[[BOUND116]] +; CHECK-NEXT: [[CONFLICT_RDX18:%.*]] = or i1 [[CONFLICT_RDX14]], [[FOUND_CONFLICT17]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX18]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE30:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META22:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP7]], splat (i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[DST_1]], i32 [[TMP4]] +; CHECK-NEXT: store i32 10, ptr [[TMP10]], align 4, !alias.scope [[META25:![0-9]+]], !noalias [[META27:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4, !alias.scope [[META30:![0-9]+]] +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP12]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP13]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP15]], label %[[PRED_LOAD_IF19:.*]], label %[[PRED_LOAD_CONTINUE20:.*]] +; CHECK: [[PRED_LOAD_IF19]]: +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[DST_1]], i32 [[TMP5]] +; CHECK-NEXT: store i32 10, ptr [[TMP16]], align 4, !alias.scope [[META25]], !noalias [[META27]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4, !alias.scope [[META30]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[TMP18]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE20]] +; CHECK: [[PRED_LOAD_CONTINUE20]]: +; CHECK-NEXT: [[TMP20:%.*]] = phi <2 x i32> [ [[TMP14]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP19]], %[[PRED_LOAD_IF19]] ] +; CHECK-NEXT: [[TMP21:%.*]] = sub <2 x i32> [[TMP20]], splat (i32 5) +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP22]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x i32> [[TMP21]], i32 0 +; CHECK-NEXT: store i32 [[TMP24]], ptr [[TMP23]], align 4, !alias.scope [[META31:![0-9]+]], !noalias [[META32:![0-9]+]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP25]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]] +; CHECK: [[PRED_STORE_IF21]]: +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 
[[TMP5]] +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x i32> [[TMP21]], i32 1 +; CHECK-NEXT: store i32 [[TMP27]], ptr [[TMP26]], align 4, !alias.scope [[META31]], !noalias [[META32]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE22]] +; CHECK: [[PRED_STORE_CONTINUE22]]: +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP28]], label %[[PRED_LOAD_IF23:.*]], label %[[PRED_LOAD_CONTINUE24:.*]] +; CHECK: [[PRED_LOAD_IF23]]: +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP29]], align 4, !alias.scope [[META30]] +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x i32> poison, i32 [[TMP30]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE24]] +; CHECK: [[PRED_LOAD_CONTINUE24]]: +; CHECK-NEXT: [[TMP32:%.*]] = phi <2 x i32> [ poison, %[[PRED_STORE_CONTINUE22]] ], [ [[TMP31]], %[[PRED_LOAD_IF23]] ] +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP33]], label %[[PRED_LOAD_IF25:.*]], label %[[PRED_LOAD_CONTINUE26:.*]] +; CHECK: [[PRED_LOAD_IF25]]: +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP34]], align 4, !alias.scope [[META30]] +; CHECK-NEXT: [[TMP36:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP35]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE26]] +; CHECK: [[PRED_LOAD_CONTINUE26]]: +; CHECK-NEXT: [[TMP37:%.*]] = phi <2 x i32> [ [[TMP32]], %[[PRED_LOAD_CONTINUE24]] ], [ [[TMP36]], %[[PRED_LOAD_IF25]] ] +; CHECK-NEXT: [[TMP38:%.*]] = add <2 x i32> [[TMP37]], splat (i32 10) +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP39]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]] +; CHECK: [[PRED_STORE_IF27]]: +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <2 x i32> [[TMP38]], i32 0 +; CHECK-NEXT: store i32 [[TMP41]], ptr [[TMP40]], align 4, !alias.scope [[META31]], !noalias [[META32]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE28]] +; CHECK: [[PRED_STORE_CONTINUE28]]: +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP42]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30]] +; CHECK: [[PRED_STORE_IF29]]: +; CHECK-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <2 x i32> [[TMP38]], i32 1 +; CHECK-NEXT: store i32 [[TMP44]], ptr [[TMP43]], align 4, !alias.scope [[META31]], !noalias [[META32]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE30]] +; CHECK: [[PRED_STORE_CONTINUE30]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP45:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP45]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %c = load i32, ptr %gep.cond, align 4 + %c.0 = icmp ule i32 %c, 11 + br i1 %c.0, label %then, label %else + +then: + %gep.src.then = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.then = 
load i32, ptr %gep.src.then, align 4 + %add = add i32 %l.src.then, 10 + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %add, ptr %gep.dst, align 4 + br label %loop.latch + +else: + %gep.dst.1.else = getelementptr inbounds i32, ptr %dst.1, i32 %iv + store i32 10, ptr %gep.dst.1.else, align 4 + %gep.src.else = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.else = load i32, ptr %gep.src.else, align 4 + %sub = sub i32 %l.src.else, 5 + %gep.dst.else = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %sub, ptr %gep.dst.else, align 4 + br label %loop.latch + +loop.latch: + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_memory_op_between_loads_alias(ptr %dst, ptr %src, ptr %cond, ptr %dst.1, i32 %n) { +; CHECK-LABEL: define void @test_memory_op_between_loads_alias( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], ptr [[DST_1:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: [[BOUND06:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND17:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT8:%.*]] = and i1 [[BOUND06]], [[BOUND17]] +; CHECK-NEXT: [[CONFLICT_RDX9:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT8]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX9]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE17:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META35:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP7]], splat (i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP10:%.*]] = 
getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: store i32 0, ptr [[TMP10]], align 4, !alias.scope [[META38:![0-9]+]], !noalias [[META40:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !alias.scope [[META38]], !noalias [[META40]] +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP12]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_LOAD_IF10:.*]], label %[[PRED_LOAD_CONTINUE11:.*]] +; CHECK: [[PRED_LOAD_IF10]]: +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: store i32 0, ptr [[TMP15]], align 4, !alias.scope [[META38]], !noalias [[META40]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4, !alias.scope [[META38]], !noalias [[META40]] +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP16]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_CONTINUE11]]: +; CHECK-NEXT: [[TMP18:%.*]] = phi <2 x i32> [ [[TMP13]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], %[[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[TMP19:%.*]] = add <2 x i32> [[TMP18]], splat (i32 10) +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i32> [[TMP19]], i32 0 +; CHECK-NEXT: store i32 [[TMP22]], ptr [[TMP21]], align 4, !alias.scope [[META42:![0-9]+]], !noalias [[META35]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP23]], label %[[PRED_STORE_IF12:.*]], label %[[PRED_STORE_CONTINUE13:.*]] +; CHECK: [[PRED_STORE_IF12]]: +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i32> [[TMP19]], i32 1 +; CHECK-NEXT: store i32 [[TMP25]], ptr [[TMP24]], align 4, !alias.scope [[META42]], !noalias [[META35]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE13]] +; CHECK: [[PRED_STORE_CONTINUE13]]: +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP26]], label %[[PRED_STORE_IF14:.*]], label %[[PRED_STORE_CONTINUE15:.*]] +; CHECK: [[PRED_STORE_IF14]]: +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4, !alias.scope [[META38]], !noalias [[META40]] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4, !alias.scope [[META42]], !noalias [[META35]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE15]] +; CHECK: [[PRED_STORE_CONTINUE15]]: +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP30]], label %[[PRED_STORE_IF16:.*]], label %[[PRED_STORE_CONTINUE17]] +; CHECK: [[PRED_STORE_IF16]]: +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4, !alias.scope [[META38]], 
!noalias [[META40]] +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: store i32 [[TMP32]], ptr [[TMP33]], align 4, !alias.scope [[META42]], !noalias [[META35]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE17]] +; CHECK: [[PRED_STORE_CONTINUE17]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP34]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %c = load i32, ptr %gep.cond, align 4 + %c.0 = icmp ule i32 %c, 11 + br i1 %c.0, label %then, label %middle + +middle: + %gep.src.middle = getelementptr inbounds i32, ptr %src, i32 %iv + store i32 0, ptr %gep.src.middle, align 4 + br label %else + +then: + %gep.src.then = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.then = load i32, ptr %gep.src.then, align 4 + %gep.dst.then = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %l.src.then, ptr %gep.dst.then, align 4 + br label %loop.latch + +else: + %gep.src.else = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.else = load i32, ptr %gep.src.else, align 4 + %add = add i32 %l.src.else, 10 + %gep.dst.else = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %add, ptr %gep.dst.else, align 4 + br label %loop.latch + +loop.latch: + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_memory_op_between_loads_no_alias_via_rt_checks(ptr %dst, ptr %src, ptr %cond, ptr %dst.1, i32 %n) { +; CHECK-LABEL: define void @test_memory_op_between_loads_no_alias_via_rt_checks( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], ptr [[DST_1:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST_1]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST_1]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND04:%.*]] = icmp ult ptr [[DST_1]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND15:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT6:%.*]] = and i1 [[BOUND04]], [[BOUND15]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT6]] +; CHECK-NEXT: [[BOUND07:%.*]] = icmp ult ptr [[DST_1]], [[SCEVGEP3]] +; CHECK-NEXT: [[BOUND18:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: 
[[FOUND_CONFLICT9:%.*]] = and i1 [[BOUND07]], [[BOUND18]] +; CHECK-NEXT: [[CONFLICT_RDX10:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT9]] +; CHECK-NEXT: [[BOUND011:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND112:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT13:%.*]] = and i1 [[BOUND011]], [[BOUND112]] +; CHECK-NEXT: [[CONFLICT_RDX14:%.*]] = or i1 [[CONFLICT_RDX10]], [[FOUND_CONFLICT13]] +; CHECK-NEXT: [[BOUND015:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP3]] +; CHECK-NEXT: [[BOUND116:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT17:%.*]] = and i1 [[BOUND015]], [[BOUND116]] +; CHECK-NEXT: [[CONFLICT_RDX18:%.*]] = or i1 [[CONFLICT_RDX14]], [[FOUND_CONFLICT17]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX18]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE26:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META45:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP7]], splat (i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[DST_1]], i32 [[TMP4]] +; CHECK-NEXT: store i32 0, ptr [[TMP10]], align 4, !alias.scope [[META48:![0-9]+]], !noalias [[META50:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4, !alias.scope [[META53:![0-9]+]] +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP12]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP13]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP15]], label %[[PRED_LOAD_IF19:.*]], label %[[PRED_LOAD_CONTINUE20:.*]] +; CHECK: [[PRED_LOAD_IF19]]: +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[DST_1]], i32 [[TMP5]] +; CHECK-NEXT: store i32 0, ptr [[TMP16]], align 4, !alias.scope [[META48]], !noalias [[META50]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4, !alias.scope [[META53]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[TMP18]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE20]] +; CHECK: [[PRED_LOAD_CONTINUE20]]: +; CHECK-NEXT: [[TMP20:%.*]] = phi <2 x i32> [ [[TMP14]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP19]], %[[PRED_LOAD_IF19]] ] +; CHECK-NEXT: [[TMP21:%.*]] = add <2 x i32> [[TMP20]], splat (i32 10) +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP22]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: 
[[PRED_STORE_IF]]: +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x i32> [[TMP21]], i32 0 +; CHECK-NEXT: store i32 [[TMP24]], ptr [[TMP23]], align 4, !alias.scope [[META54:![0-9]+]], !noalias [[META55:![0-9]+]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP25]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]] +; CHECK: [[PRED_STORE_IF21]]: +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x i32> [[TMP21]], i32 1 +; CHECK-NEXT: store i32 [[TMP27]], ptr [[TMP26]], align 4, !alias.scope [[META54]], !noalias [[META55]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE22]] +; CHECK: [[PRED_STORE_CONTINUE22]]: +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]] +; CHECK: [[PRED_STORE_IF23]]: +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP29]], align 4, !alias.scope [[META53]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: store i32 [[TMP30]], ptr [[TMP31]], align 4, !alias.scope [[META54]], !noalias [[META55]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE24]] +; CHECK: [[PRED_STORE_CONTINUE24]]: +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP32]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26]] +; CHECK: [[PRED_STORE_IF25]]: +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP33]], align 4, !alias.scope [[META53]] +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]] +; CHECK-NEXT: store i32 [[TMP34]], ptr [[TMP35]], align 4, !alias.scope [[META54]], !noalias [[META55]] +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE26]] +; CHECK: [[PRED_STORE_CONTINUE26]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP56:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %c = load i32, ptr %gep.cond, align 4 + %c.0 = icmp ule i32 %c, 11 + br i1 %c.0, label %then, label %middle + +middle: + %gep.dst.1 = getelementptr inbounds i32, ptr %dst.1, i32 %iv + store i32 0, ptr %gep.dst.1, align 4 + br label %else + +then: + %gep.src.then = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.then = load i32, ptr %gep.src.then, align 4 + %gep.dst.then = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %l.src.then, ptr %gep.dst.then, align 4 + br label %loop.latch + +else: + %gep.src.else = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.else = load i32, ptr %gep.src.else, align 4 + %add = add i32 %l.src.else, 10 + %gep.dst.else = getelementptr inbounds i32, ptr %dst, i32 %iv + store 
i32 %add, ptr %gep.dst.else, align 4 + br label %loop.latch + +loop.latch: + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + diff --git a/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads.ll b/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads.ll new file mode 100644 index 0000000000000..b30d010aaf9c9 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads.ll @@ -0,0 +1,1291 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph:" --version 6 +; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck %s + +define void @test(ptr %dst, ptr %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE11:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x ptr> [[TMP10]], ptr [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP24]], align 4, !alias.scope [[META0:![0-9]+]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP34:%.*]] = xor <2 x i1> [[TMP15]], splat (i1 true) +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP34]], i32 0 +; CHECK-NEXT: br i1 [[TMP35]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP8]], align 4, 
!alias.scope [[META3:![0-9]+]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP20:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP19]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x i1> [[TMP34]], i32 1 +; CHECK-NEXT: br i1 [[TMP21]], label %[[PRED_LOAD_IF6:.*]], label %[[PRED_LOAD_CONTINUE7:.*]] +; CHECK: [[PRED_LOAD_IF6]]: +; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP9]], align 4, !alias.scope [[META3]] +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP22]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE7]] +; CHECK: [[PRED_LOAD_CONTINUE7]]: +; CHECK-NEXT: [[TMP36:%.*]] = phi <2 x i32> [ [[TMP20]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP23]], %[[PRED_LOAD_IF6]] ] +; CHECK-NEXT: [[TMP25:%.*]] = add <2 x i32> [[TMP36]], splat (i32 10) +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <2 x i1> [[TMP15]], i32 0 +; CHECK-NEXT: br i1 [[TMP30]], label %[[PRED_LOAD_IF8:.*]], label %[[PRED_LOAD_CONTINUE9:.*]] +; CHECK: [[PRED_LOAD_IF8]]: +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP8]], align 4, !alias.scope [[META3]] +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x i32> poison, i32 [[TMP26]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE9]] +; CHECK: [[PRED_LOAD_CONTINUE9]]: +; CHECK-NEXT: [[TMP33:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE7]] ], [ [[TMP31]], %[[PRED_LOAD_IF8]] ] +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x i1> [[TMP15]], i32 1 +; CHECK-NEXT: br i1 [[TMP32]], label %[[PRED_LOAD_IF10:.*]], label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_IF10]]: +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP9]], align 4, !alias.scope [[META3]] +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x i32> [[TMP33]], i32 [[TMP27]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_CONTINUE11]]: +; CHECK-NEXT: [[TMP29:%.*]] = phi <2 x i32> [ [[TMP33]], %[[PRED_LOAD_CONTINUE9]] ], [ [[TMP28]], %[[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> [[TMP29]], <2 x i32> [[TMP25]] +; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP37]], align 4, !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP60:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP60]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %l.c = load i32, ptr %gep.cond + %c = icmp ule i32 %l.c, 11 + br i1 %c, label %then, label %else + +then: + %l.src = load i32, ptr %gep.src, align 4 + br label %loop.latch + +else: + %l.src.2 = load i32, ptr %gep.src, align 4 + %add = add i32 %l.src.2, 10 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.src, %then ], [ %add, %else ] + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %merge, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 
%iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Negative test: Different addresses - should NOT hoist +define void @different_addresses(ptr %dst, ptr %src1, ptr %src2, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @different_addresses( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC1:%.*]], ptr [[SRC2:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC15:%.*]] = ptrtoint ptr [[SRC1]] to i64 +; CHECK-NEXT: [[SRC23:%.*]] = ptrtoint ptr [[SRC2]] to i64 +; CHECK-NEXT: [[COND2:%.*]] = ptrtoint ptr [[COND]] to i64 +; CHECK-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[DST1]], [[COND2]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[DST1]], [[SRC23]] +; CHECK-NEXT: [[FOUND_CONFLICT6:%.*]] = icmp ult i64 [[TMP1]], 8 +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT6]] +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[DST1]], [[SRC15]] +; CHECK-NEXT: [[FOUND_CONFLICT9:%.*]] = icmp ult i64 [[TMP2]], 8 +; CHECK-NEXT: [[CONFLICT_RDX10:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT9]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX10]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE13:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP16]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP12:%.*]] = xor <2 x i1> [[TMP11]], splat (i1 true) +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP12]], i32 0 +; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP20]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> poison, i32 [[TMP15]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP17:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP19]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x i1> [[TMP12]], i32 1 +; CHECK-NEXT: br i1 [[TMP18]], label %[[PRED_LOAD_IF8:.*]], label %[[PRED_LOAD_CONTINUE9:.*]] +; CHECK: [[PRED_LOAD_IF8]]: +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP25]], align 4 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> [[TMP17]], i32 [[TMP29]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE9]] +; CHECK: [[PRED_LOAD_CONTINUE9]]: +; CHECK-NEXT: [[TMP22:%.*]] = phi <2 x i32> [ [[TMP17]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP21]], %[[PRED_LOAD_IF8]] ] +; CHECK-NEXT: [[TMP23:%.*]] = add <2 x i32> [[TMP22]], splat (i32 10) +; CHECK-NEXT: [[TMP28:%.*]] = 
extractelement <2 x i1> [[TMP11]], i32 0 +; CHECK-NEXT: br i1 [[TMP28]], label %[[PRED_LOAD_IF10:.*]], label %[[PRED_LOAD_CONTINUE11:.*]] +; CHECK: [[PRED_LOAD_IF10]]: +; CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP49]], align 4 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> poison, i32 [[TMP24]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_CONTINUE11]]: +; CHECK-NEXT: [[TMP32:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE9]] ], [ [[TMP30]], %[[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP11]], i32 1 +; CHECK-NEXT: br i1 [[TMP31]], label %[[PRED_LOAD_IF12:.*]], label %[[PRED_LOAD_CONTINUE13]] +; CHECK: [[PRED_LOAD_IF12]]: +; CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP52]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP26]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE13]] +; CHECK: [[PRED_LOAD_CONTINUE13]]: +; CHECK-NEXT: [[TMP33:%.*]] = phi <2 x i32> [ [[TMP32]], %[[PRED_LOAD_CONTINUE11]] ], [ [[TMP27]], %[[PRED_LOAD_IF12]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP11]], <2 x i32> [[TMP33]], <2 x i32> [[TMP23]] +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP34]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP60:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP60]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.src.1 = getelementptr inbounds i32, ptr %src1, i32 %iv + %gep.src.2 = getelementptr inbounds i32, ptr %src2, i32 %iv + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %l.c = load i32, ptr %gep.cond + %c = icmp ule i32 %l.c, 11 + br i1 %c, label %then, label %else + +then: + %l.src = load i32, ptr %gep.src.1, align 4 + br label %loop.latch + +else: + %l.src.2 = load i32, ptr %gep.src.2, align 4 + %add = add i32 %l.src.2, 10 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.src, %then ], [ %add, %else ] + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %merge, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Negative test: Non-complementary masks - should NOT hoist +define void @non_complementary_masks(ptr %dst, ptr %src, ptr %cond1, ptr %cond2, i32 %n) { +; CHECK-LABEL: define void @non_complementary_masks( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND1:%.*]], ptr [[COND2:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; 
CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[COND1]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[COND2]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[COND1]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND04:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND15:%.*]] = icmp ult ptr [[COND2]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT6:%.*]] = and i1 [[BOUND04]], [[BOUND15]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT6]] +; CHECK-NEXT: [[BOUND07:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP3]] +; CHECK-NEXT: [[BOUND18:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT9:%.*]] = and i1 [[BOUND07]], [[BOUND18]] +; CHECK-NEXT: [[CONFLICT_RDX10:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT9]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX10]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE17:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x ptr> [[TMP10]], ptr [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[COND1]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[COND2]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP24]], align 4, !alias.scope [[META14:![0-9]+]] +; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <2 x i32>, ptr [[TMP25]], align 4, !alias.scope [[META17:![0-9]+]] +; CHECK-NEXT: [[TMP37:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP38:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD11]], splat (i32 20) +; CHECK-NEXT: [[TMP18:%.*]] = xor <2 x i1> [[TMP37]], splat (i1 true) +; CHECK-NEXT: [[TMP19:%.*]] = select <2 x i1> [[TMP18]], <2 x i1> [[TMP38]], <2 x i1> zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i1> [[TMP19]], i32 0 +; CHECK-NEXT: br i1 [[TMP20]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP8]], align 4, !alias.scope [[META19:![0-9]+]] +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> poison, i32 [[TMP21]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP23:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP22]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP19]], i32 1 +; CHECK-NEXT: br i1 [[TMP39]], label %[[PRED_LOAD_IF12:.*]], label %[[PRED_LOAD_CONTINUE13:.*]] +; CHECK: [[PRED_LOAD_IF12]]: +; CHECK-NEXT: [[TMP40:%.*]] = load i32, ptr [[TMP9]], align 4, !alias.scope [[META19]] +; CHECK-NEXT: 
[[TMP26:%.*]] = insertelement <2 x i32> [[TMP23]], i32 [[TMP40]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE13]] +; CHECK: [[PRED_LOAD_CONTINUE13]]: +; CHECK-NEXT: [[TMP27:%.*]] = phi <2 x i32> [ [[TMP23]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP26]], %[[PRED_LOAD_IF12]] ] +; CHECK-NEXT: [[TMP28:%.*]] = add <2 x i32> [[TMP27]], splat (i32 10) +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x i1> [[TMP37]], i32 0 +; CHECK-NEXT: br i1 [[TMP33]], label %[[PRED_LOAD_IF14:.*]], label %[[PRED_LOAD_CONTINUE15:.*]] +; CHECK: [[PRED_LOAD_IF14]]: +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP8]], align 4, !alias.scope [[META19]] +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <2 x i32> poison, i32 [[TMP29]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE15]] +; CHECK: [[PRED_LOAD_CONTINUE15]]: +; CHECK-NEXT: [[TMP36:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE13]] ], [ [[TMP34]], %[[PRED_LOAD_IF14]] ] +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP37]], i32 1 +; CHECK-NEXT: br i1 [[TMP35]], label %[[PRED_LOAD_IF16:.*]], label %[[PRED_LOAD_CONTINUE17]] +; CHECK: [[PRED_LOAD_IF16]]: +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP9]], align 4, !alias.scope [[META19]] +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x i32> [[TMP36]], i32 [[TMP30]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE17]] +; CHECK: [[PRED_LOAD_CONTINUE17]]: +; CHECK-NEXT: [[TMP32:%.*]] = phi <2 x i32> [ [[TMP36]], %[[PRED_LOAD_CONTINUE15]] ], [ [[TMP31]], %[[PRED_LOAD_IF16]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP19]], <2 x i32> [[TMP28]], <2 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI18:%.*]] = select <2 x i1> [[TMP37]], <2 x i32> [[TMP32]], <2 x i32> [[PREDPHI]] +; CHECK-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI18]], ptr [[TMP41]], align 4, !alias.scope [[META21:![0-9]+]], !noalias [[META23:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP63:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP63]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv + %gep.cond1 = getelementptr inbounds i32, ptr %cond1, i32 %iv + %gep.cond2 = getelementptr inbounds i32, ptr %cond2, i32 %iv + %l.c1 = load i32, ptr %gep.cond1 + %l.c2 = load i32, ptr %gep.cond2 + %c1 = icmp ule i32 %l.c1, 11 + %c2 = icmp ule i32 %l.c2, 20 + br i1 %c1, label %then, label %else + +then: + %l.src = load i32, ptr %gep.src, align 4 + br label %loop.latch + +else: + br i1 %c2, label %else.then, label %loop.latch + +else.then: + %l.src.2 = load i32, ptr %gep.src, align 4 + %add = add i32 %l.src.2, 10 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.src, %then ], [ %add, %else.then ], [ 0, %else ] + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %merge, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Negative test: Different access sizes - should NOT hoist +; Both loads use the same pointer but have different types (i8 vs i32) +define void @different_access_sizes(ptr %dst, 
ptr %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @different_access_sizes( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE11:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x ptr> [[TMP8]], ptr [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP10]], align 4, !alias.scope [[META26:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP12:%.*]] = xor <2 x i1> [[TMP11]], splat (i1 true) +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP12]], i32 0 +; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP6]], align 4, !alias.scope [[META29:![0-9]+]] +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> poison, i32 [[TMP14]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP15]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x i1> [[TMP12]], i32 1 +; CHECK-NEXT: br i1 [[TMP17]], label %[[PRED_LOAD_IF6:.*]], label %[[PRED_LOAD_CONTINUE7:.*]] +; CHECK: [[PRED_LOAD_IF6]]: +; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP7]], align 4, !alias.scope [[META29]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> [[TMP16]], i32 [[TMP18]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE7]] +; CHECK: [[PRED_LOAD_CONTINUE7]]: +; 
CHECK-NEXT: [[TMP20:%.*]] = phi <2 x i32> [ [[TMP16]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP19]], %[[PRED_LOAD_IF6]] ] +; CHECK-NEXT: [[TMP21:%.*]] = add <2 x i32> [[TMP20]], splat (i32 10) +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i1> [[TMP11]], i32 0 +; CHECK-NEXT: br i1 [[TMP22]], label %[[PRED_LOAD_IF8:.*]], label %[[PRED_LOAD_CONTINUE9:.*]] +; CHECK: [[PRED_LOAD_IF8]]: +; CHECK-NEXT: [[TMP23:%.*]] = load i8, ptr [[TMP6]], align 4, !alias.scope [[META29]] +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <2 x i8> poison, i8 [[TMP23]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE9]] +; CHECK: [[PRED_LOAD_CONTINUE9]]: +; CHECK-NEXT: [[TMP25:%.*]] = phi <2 x i8> [ poison, %[[PRED_LOAD_CONTINUE7]] ], [ [[TMP24]], %[[PRED_LOAD_IF8]] ] +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x i1> [[TMP11]], i32 1 +; CHECK-NEXT: br i1 [[TMP26]], label %[[PRED_LOAD_IF10:.*]], label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_IF10]]: +; CHECK-NEXT: [[TMP27:%.*]] = load i8, ptr [[TMP7]], align 4, !alias.scope [[META29]] +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x i8> [[TMP25]], i8 [[TMP27]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_CONTINUE11]]: +; CHECK-NEXT: [[TMP29:%.*]] = phi <2 x i8> [ [[TMP25]], %[[PRED_LOAD_CONTINUE9]] ], [ [[TMP28]], %[[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[TMP30:%.*]] = zext <2 x i8> [[TMP29]] to <2 x i32> +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP11]], <2 x i32> [[TMP30]], <2 x i32> [[TMP21]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP31]], align 4, !alias.scope [[META31:![0-9]+]], !noalias [[META33:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %l.c = load i32, ptr %gep.cond + %c = icmp ule i32 %l.c, 11 + br i1 %c, label %then, label %else + +then: + %l.src = load i8, ptr %gep.src, align 4 + %ext = zext i8 %l.src to i32 + br label %loop.latch + +else: + %l.src.2 = load i32, ptr %gep.src, align 4 + %add = add i32 %l.src.2, 10 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %ext, %then ], [ %add, %else ] + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %merge, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Positive test: Same address with different alignments - should hoist with minimum alignment +define void @different_alignments_same_address(ptr %dst, ptr %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @different_alignments_same_address( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 
[[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE11:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x ptr> [[TMP10]], ptr [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP20]], align 4, !alias.scope [[META36:![0-9]+]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP16:%.*]] = xor <2 x i1> [[TMP15]], splat (i1 true) +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x i1> [[TMP16]], i32 0 +; CHECK-NEXT: br i1 [[TMP17]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP8]], align 4, !alias.scope [[META39:![0-9]+]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP35:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP19]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x i1> [[TMP16]], i32 1 +; CHECK-NEXT: br i1 [[TMP21]], label %[[PRED_LOAD_IF6:.*]], label %[[PRED_LOAD_CONTINUE7:.*]] +; CHECK: [[PRED_LOAD_IF6]]: +; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP9]], align 4, !alias.scope [[META39]] +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x i32> [[TMP35]], i32 [[TMP22]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE7]] +; CHECK: [[PRED_LOAD_CONTINUE7]]: +; CHECK-NEXT: [[TMP24:%.*]] = phi <2 x i32> [ [[TMP35]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP23]], %[[PRED_LOAD_IF6]] ] +; CHECK-NEXT: [[TMP25:%.*]] = add <2 x i32> [[TMP24]], splat (i32 10) +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <2 x i1> [[TMP15]], i32 0 +; CHECK-NEXT: br i1 [[TMP30]], label %[[PRED_LOAD_IF8:.*]], label %[[PRED_LOAD_CONTINUE9:.*]] +; CHECK: [[PRED_LOAD_IF8]]: +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr 
[[TMP8]], align 2, !alias.scope [[META39]] +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x i32> poison, i32 [[TMP26]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE9]] +; CHECK: [[PRED_LOAD_CONTINUE9]]: +; CHECK-NEXT: [[TMP33:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE7]] ], [ [[TMP31]], %[[PRED_LOAD_IF8]] ] +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x i1> [[TMP15]], i32 1 +; CHECK-NEXT: br i1 [[TMP32]], label %[[PRED_LOAD_IF10:.*]], label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_IF10]]: +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP9]], align 2, !alias.scope [[META39]] +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x i32> [[TMP33]], i32 [[TMP27]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_CONTINUE11]]: +; CHECK-NEXT: [[TMP29:%.*]] = phi <2 x i32> [ [[TMP33]], %[[PRED_LOAD_CONTINUE9]] ], [ [[TMP28]], %[[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> [[TMP29]], <2 x i32> [[TMP25]] +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP34]], align 4, !alias.scope [[META41:![0-9]+]], !noalias [[META43:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP48:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %l.c = load i32, ptr %gep.cond + %c = icmp ule i32 %l.c, 11 + br i1 %c, label %then, label %else + +then: + %l.src = load i32, ptr %gep.src, align 2 + br label %loop.latch + +else: + %l.src.2 = load i32, ptr %gep.src, align 4 + %add = add i32 %l.src.2, 10 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.src, %then ], [ %add, %else ] + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %merge, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Negative test: Volatile loads - should NOT hoist +define void @volatile_load(ptr %dst, ptr %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @volatile_load( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]] +; CHECK-NEXT: [[GEP_COND:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[IV]] +; CHECK-NEXT: [[L_C:%.*]] = load i32, ptr [[GEP_COND]], align 4 +; CHECK-NEXT: [[C:%.*]] = icmp ule i32 [[L_C]], 11 +; CHECK-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[L_SRC:%.*]] = load volatile i32, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[L_SRC_2]], 10 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; 
CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_SRC]], %[[THEN]] ], [ [[ADD]], %[[ELSE]] ] +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[IV]] +; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_DST]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %l.c = load i32, ptr %gep.cond + %c = icmp ule i32 %l.c, 11 + br i1 %c, label %then, label %else + +then: + %l.src = load volatile i32, ptr %gep.src, align 4 + br label %loop.latch + +else: + %l.src.2 = load i32, ptr %gep.src, align 4 + %add = add i32 %l.src.2, 10 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.src, %then ], [ %add, %else ] + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %merge, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Test hoisting with duplicate GEPs: The same address is computed by different +; GEP instructions in different branches. The hoisting pass should use SCEV to +; recognize they compute the same address and hoist the load. +define void @duplicate_gep(ptr %dst, ptr %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @duplicate_gep( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE11:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4, 
!alias.scope [[META46:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP7]], splat (i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !alias.scope [[META49:![0-9]+]] +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP12]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_LOAD_IF6:.*]], label %[[PRED_LOAD_CONTINUE7:.*]] +; CHECK: [[PRED_LOAD_IF6]]: +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4, !alias.scope [[META49]] +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP16]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE7]] +; CHECK: [[PRED_LOAD_CONTINUE7]]: +; CHECK-NEXT: [[TMP18:%.*]] = phi <2 x i32> [ [[TMP13]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], %[[PRED_LOAD_IF6]] ] +; CHECK-NEXT: [[TMP19:%.*]] = add <2 x i32> [[TMP18]], splat (i32 10) +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP20]], label %[[PRED_LOAD_IF8:.*]], label %[[PRED_LOAD_CONTINUE9:.*]] +; CHECK: [[PRED_LOAD_IF8]]: +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4, !alias.scope [[META49]] +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE9]] +; CHECK: [[PRED_LOAD_CONTINUE9]]: +; CHECK-NEXT: [[TMP24:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE7]] ], [ [[TMP23]], %[[PRED_LOAD_IF8]] ] +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP25]], label %[[PRED_LOAD_IF10:.*]], label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_IF10]]: +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP26]], align 4, !alias.scope [[META49]] +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP27]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_CONTINUE11]]: +; CHECK-NEXT: [[TMP29:%.*]] = phi <2 x i32> [ [[TMP24]], %[[PRED_LOAD_CONTINUE9]] ], [ [[TMP28]], %[[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP7]], <2 x i32> [[TMP29]], <2 x i32> [[TMP19]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP30]], align 4, !alias.scope [[META51:![0-9]+]], !noalias [[META53:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], 
[[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %l.c = load i32, ptr %gep.cond + %c = icmp ule i32 %l.c, 11 + br i1 %c, label %then, label %else + +then: + %gep.src.then = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src = load i32, ptr %gep.src.then, align 4 + br label %loop.latch + +else: + %gep.src.else = getelementptr inbounds i32, ptr %src, i32 %iv + %l.src.2 = load i32, ptr %gep.src.else, align 4 + %add = add i32 %l.src.2, 10 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.src, %then ], [ %add, %else ] + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %merge, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; Test with non-unit-stride loads: Loads have stride 8 (the i64 element size) +; instead of unit stride (4 bytes for the i32 loads). The hoisting optimization +; should still work since both loads access the same address with the same stride. +define void @non_unit_stride_i64(ptr %dst, ptr %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @non_unit_stride_i64( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[COND]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP1]], 3 +; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 4 +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP5]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[COND]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE11:.*]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP6]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP8]], align 4, !alias.scope [[META56:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[TMP10:%.*]] = xor <2 x i1> [[TMP9]], splat (i1 true) +; CHECK-NEXT: [[TMP11:%.*]] =
extractelement <2 x i1> [[TMP10]], i32 0 +; CHECK-NEXT: br i1 [[TMP11]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i32 [[TMP6]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !alias.scope [[META59:![0-9]+]] +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> poison, i32 [[TMP13]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP15:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP14]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1 +; CHECK-NEXT: br i1 [[TMP16]], label %[[PRED_LOAD_IF6:.*]], label %[[PRED_LOAD_CONTINUE7:.*]] +; CHECK: [[PRED_LOAD_IF6]]: +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i32 [[TMP7]] +; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4, !alias.scope [[META59]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP18]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE7]] +; CHECK: [[PRED_LOAD_CONTINUE7]]: +; CHECK-NEXT: [[TMP20:%.*]] = phi <2 x i32> [ [[TMP15]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP19]], %[[PRED_LOAD_IF6]] ] +; CHECK-NEXT: [[TMP21:%.*]] = add <2 x i32> [[TMP20]], splat (i32 10) +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i1> [[TMP9]], i32 0 +; CHECK-NEXT: br i1 [[TMP22]], label %[[PRED_LOAD_IF8:.*]], label %[[PRED_LOAD_CONTINUE9:.*]] +; CHECK: [[PRED_LOAD_IF8]]: +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i32 [[TMP6]] +; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4, !alias.scope [[META59]] +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> poison, i32 [[TMP24]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE9]] +; CHECK: [[PRED_LOAD_CONTINUE9]]: +; CHECK-NEXT: [[TMP26:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE7]] ], [ [[TMP25]], %[[PRED_LOAD_IF8]] ] +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP9]], i32 1 +; CHECK-NEXT: br i1 [[TMP27]], label %[[PRED_LOAD_IF10:.*]], label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_IF10]]: +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i32 [[TMP7]] +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP28]], align 4, !alias.scope [[META59]] +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> [[TMP26]], i32 [[TMP29]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE11]] +; CHECK: [[PRED_LOAD_CONTINUE11]]: +; CHECK-NEXT: [[TMP31:%.*]] = phi <2 x i32> [ [[TMP26]], %[[PRED_LOAD_CONTINUE9]] ], [ [[TMP30]], %[[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP9]], <2 x i32> [[TMP31]], <2 x i32> [[TMP21]] +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP6]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP32]], align 4, !alias.scope [[META61:![0-9]+]], !noalias [[META63:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP33:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP33]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP64:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.cond = getelementptr inbounds i32, ptr %cond, 
i32 %iv + %l.c = load i32, ptr %gep.cond + %c = icmp ule i32 %l.c, 11 + br i1 %c, label %then, label %else + +then: + %gep.src.then = getelementptr inbounds i64, ptr %src, i32 %iv + %l.src = load i32, ptr %gep.src.then, align 4 + br label %loop.latch + +else: + %gep.src.else = getelementptr inbounds i64, ptr %src, i32 %iv + %l.src.2 = load i32, ptr %gep.src.else, align 4 + %add = add i32 %l.src.2, 10 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.src, %then ], [ %add, %else ] + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %merge, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + + +; Test that loads inside masked regions (without individual masks) are +; correctly detected and hoisted when they have complementary predicates. +define void @hoist_loads_in_masked_regions(ptr noalias %dst, ptr noalias %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @hoist_loads_in_masked_regions( +; CHECK-SAME: ptr noalias [[DST:%.*]], ptr noalias [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, ptr [[SRC]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11) +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i32> [[WIDE_LOAD1]], <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[INDEX]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP3]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP66:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %l.c = load i32, ptr %gep.cond + %c = icmp ule i32 %l.c, 11 + br i1 %c, label %then, label %loop.latch + +then: + %l.src = load i32, ptr %gep.src, align 4 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.src, %then ], [ 0, %loop ] + %l.src.2 = load i32, ptr %gep.src, align 4 + %add = add i32 %l.src.2, %merge + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %merge, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} +; Test that when there are 3 or more regions with complementary predicates +;
loading from the same address, all loads are hoisted and replaced, not just +; the first pair. This tests the K loop that continues searching after finding +; the initial complementary pair. +define void @hoist_multiple_complementary_loads(ptr noalias %dst, ptr noalias %src, ptr %cond, i32 %n) { +; CHECK-LABEL: define void @hoist_multiple_complementary_loads( +; CHECK-SAME: ptr noalias [[DST:%.*]], ptr noalias [[SRC:%.*]], ptr [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE10:.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP0]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP43]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <2 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP7]], splat (i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP64:%.*]] = load i32, ptr [[TMP63]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> poison, i32 [[TMP64]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP12]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2:.*]] +; CHECK: [[PRED_LOAD_IF1]]: +; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP70:%.*]] = load i32, ptr [[TMP69]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP70]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_CONTINUE2]]: +; CHECK-NEXT: [[TMP28:%.*]] = phi <2 x i32> [ [[TMP13]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], %[[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP15:%.*]] = add <2 x i32> [[TMP28]], splat (i32 1) +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <2 x i32> [[WIDE_LOAD]], splat (i32 32) +; CHECK-NEXT: [[TMP29:%.*]] = xor <2 x i1> [[TMP16]], splat (i1 true) +; CHECK-NEXT: [[TMP32:%.*]] = select <2 x i1> [[TMP7]], <2 x i1> [[TMP29]], <2 x i1> zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP32]], i32 0 +; CHECK-NEXT: br i1 [[TMP19]], label %[[PRED_LOAD_IF3:.*]], label %[[PRED_LOAD_CONTINUE4:.*]] +; CHECK: [[PRED_LOAD_IF3]]: +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4 +; CHECK-NEXT: [[TMP33:%.*]] = insertelement <2 x i32> poison, i32 [[TMP21]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE4]] +; CHECK: [[PRED_LOAD_CONTINUE4]]: +; CHECK-NEXT: [[TMP23:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE2]] ], [ [[TMP33]], 
%[[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x i1> [[TMP32]], i32 1 +; CHECK-NEXT: br i1 [[TMP24]], label %[[PRED_LOAD_IF5:.*]], label %[[PRED_LOAD_CONTINUE6:.*]] +; CHECK: [[PRED_LOAD_IF5]]: +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x i32> [[TMP23]], i32 [[TMP26]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE6]] +; CHECK: [[PRED_LOAD_CONTINUE6]]: +; CHECK-NEXT: [[TMP18:%.*]] = phi <2 x i32> [ [[TMP23]], %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP27]], %[[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP22:%.*]] = mul <2 x i32> [[TMP18]], splat (i32 2) +; CHECK-NEXT: [[TMP30:%.*]] = select <2 x i1> [[TMP7]], <2 x i1> [[TMP16]], <2 x i1> zeroinitializer +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP30]], i32 0 +; CHECK-NEXT: br i1 [[TMP31]], label %[[PRED_LOAD_IF7:.*]], label %[[PRED_LOAD_CONTINUE8:.*]] +; CHECK: [[PRED_LOAD_IF7]]: +; CHECK-NEXT: [[TMP61:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP61]], align 4 +; CHECK-NEXT: [[TMP38:%.*]] = insertelement <2 x i32> poison, i32 [[TMP34]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE8]] +; CHECK: [[PRED_LOAD_CONTINUE8]]: +; CHECK-NEXT: [[TMP35:%.*]] = phi <2 x i32> [ poison, %[[PRED_LOAD_CONTINUE6]] ], [ [[TMP38]], %[[PRED_LOAD_IF7]] ] +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <2 x i1> [[TMP30]], i32 1 +; CHECK-NEXT: br i1 [[TMP36]], label %[[PRED_LOAD_IF9:.*]], label %[[PRED_LOAD_CONTINUE10]] +; CHECK: [[PRED_LOAD_IF9]]: +; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP65]], align 4 +; CHECK-NEXT: [[TMP44:%.*]] = insertelement <2 x i32> [[TMP35]], i32 [[TMP37]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE10]] +; CHECK: [[PRED_LOAD_CONTINUE10]]: +; CHECK-NEXT: [[TMP45:%.*]] = phi <2 x i32> [ [[TMP35]], %[[PRED_LOAD_CONTINUE8]] ], [ [[TMP44]], %[[PRED_LOAD_IF9]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP32]], <2 x i32> [[TMP22]], <2 x i32> [[TMP15]] +; CHECK-NEXT: [[TMP42:%.*]] = select <2 x i1> [[TMP30]], <2 x i32> [[TMP45]], <2 x i32> [[PREDPHI]] +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[TMP39]], i64 32 +; CHECK-NEXT: store <2 x i32> [[TMP42]], ptr [[TMP40]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP41:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP41]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP68:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.cond = getelementptr inbounds i32, ptr %cond, i32 %iv + %l.cond = load i32, ptr %gep.cond, align 4 + %c.1 = icmp ne i32 %l.cond, 0 + br i1 %c.1, label %check2, label %region3 + +check2: + %c.2 = icmp ne i32 %l.cond, 32 + br i1 %c.2, label %region1, label %region2 + +region1: + %gep.src.8.r1 = getelementptr inbounds i8, ptr %src, i32 %iv + %val1 = load i32, ptr %gep.src.8.r1, align 4 + br label %loop.latch + +region2: + %gep.src.8.r2 = getelementptr inbounds i8, ptr %src, i32 %iv + %val2 = load i32, 
ptr %gep.src.8.r2, align 4 + %mul = mul i32 %val2, 2 + br label %loop.latch + +region3: + %gep.src.8.r3 = getelementptr inbounds i8, ptr %src, i32 %iv + %val3 = load i32, ptr %gep.src.8.r3, align 4 + %add = add i32 %val3, 1 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %val1, %region1 ], [ %mul, %region2 ], [ %add, %region3 ] + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv + %offset.dst = getelementptr inbounds i8, ptr %gep.dst, i64 32 + store i32 %merge, ptr %offset.dst, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @hoist_predicated_load_with_chained_geps1(ptr %dst, ptr %src, i1 %cond) { +; CHECK-LABEL: define void @hoist_predicated_load_with_chained_geps1( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i1 [[COND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 2 +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 8 +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 2210 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP1]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[COND]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE8:.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0 +; CHECK-NEXT: br i1 [[TMP3]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr [11 x i16], ptr [[SRC]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i64 8 +; CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[TMP5]], align 2, !alias.scope [[META70:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i16> poison, i16 [[TMP6]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x i16> [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF3:.*]], label %[[PRED_LOAD_CONTINUE4:.*]] +; CHECK: [[PRED_LOAD_IF3]]: +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr [11 x i16], ptr [[SRC]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i64 8 +; CHECK-NEXT: [[TMP12:%.*]] = load i16, ptr [[TMP11]], align 2, !alias.scope [[META70]] +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i16> [[TMP8]], i16 [[TMP12]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE4]] +; CHECK: [[PRED_LOAD_CONTINUE4]]: +; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x i16> [ [[TMP8]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], %[[PRED_LOAD_IF3]] ] +; CHECK-NEXT: 
br i1 [[COND]], label %[[PRED_LOAD_IF5:.*]], label %[[PRED_LOAD_CONTINUE6:.*]] +; CHECK: [[PRED_LOAD_IF5]]: +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr [11 x i16], ptr [[SRC]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[TMP15]], i64 8 +; CHECK-NEXT: [[TMP17:%.*]] = load i16, ptr [[TMP16]], align 2, !alias.scope [[META70]] +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x i16> poison, i16 [[TMP17]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE6]] +; CHECK: [[PRED_LOAD_CONTINUE6]]: +; CHECK-NEXT: [[TMP19:%.*]] = phi <2 x i16> [ poison, %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP18]], %[[PRED_LOAD_IF5]] ] +; CHECK-NEXT: br i1 [[COND]], label %[[PRED_LOAD_IF7:.*]], label %[[PRED_LOAD_CONTINUE8]] +; CHECK: [[PRED_LOAD_IF7]]: +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr [11 x i16], ptr [[SRC]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i64 8 +; CHECK-NEXT: [[TMP22:%.*]] = load i16, ptr [[TMP21]], align 2, !alias.scope [[META70]] +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x i16> [[TMP19]], i16 [[TMP22]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE8]] +; CHECK: [[PRED_LOAD_CONTINUE8]]: +; CHECK-NEXT: [[TMP24:%.*]] = phi <2 x i16> [ [[TMP19]], %[[PRED_LOAD_CONTINUE6]] ], [ [[TMP23]], %[[PRED_LOAD_IF7]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[COND]], <2 x i16> [[TMP24]], <2 x i16> [[TMP14]] +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 1 +; CHECK-NEXT: store i16 [[TMP25]], ptr [[DST]], align 2, !alias.scope [[META73:![0-9]+]], !noalias [[META70]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP75:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + br i1 %cond, label %then, label %else + +then: + %gep1 = getelementptr [11 x i16], ptr %src, i64 %iv + %gep2 = getelementptr i8, ptr %gep1, i64 8 + %l.0 = load i16, ptr %gep2, align 2 + br label %loop.latch + +else: + %gep3 = getelementptr [11 x i16], ptr %src, i64 %iv + %gep4 = getelementptr i8, ptr %gep3, i64 8 + %l.1 = load i16, ptr %gep4, align 2 + br label %loop.latch + +loop.latch: + %merge = phi i16 [ %l.0, %then ], [ %l.1, %else ] + store i16 %merge, ptr %dst, align 2 + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, 100 + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} + +define void @hoist_predicated_load_with_chained_geps2(ptr %dst, ptr %src, i1 %cond) { +; CHECK-LABEL: define void @hoist_predicated_load_with_chained_geps2( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i1 [[COND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 2 +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 8 +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 2210 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP1]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> 
poison, i1 [[COND]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE8:.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [11 x i16], ptr [[SRC]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr [11 x i16], ptr [[SRC]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x ptr> [[TMP5]], ptr [[TMP4]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0 +; CHECK-NEXT: br i1 [[TMP7]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP3]], i64 8 +; CHECK-NEXT: [[TMP9:%.*]] = load i16, ptr [[TMP8]], align 2, !alias.scope [[META77:![0-9]+]] +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i16> poison, i16 [[TMP9]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP11:%.*]] = phi <2 x i16> [ poison, %[[VECTOR_BODY]] ], [ [[TMP10]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1 +; CHECK-NEXT: br i1 [[TMP12]], label %[[PRED_LOAD_IF3:.*]], label %[[PRED_LOAD_CONTINUE4:.*]] +; CHECK: [[PRED_LOAD_IF3]]: +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP4]], i64 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i16, ptr [[TMP13]], align 2, !alias.scope [[META77]] +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i16> [[TMP11]], i16 [[TMP14]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE4]] +; CHECK: [[PRED_LOAD_CONTINUE4]]: +; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x i16> [ [[TMP11]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP15]], %[[PRED_LOAD_IF3]] ] +; CHECK-NEXT: br i1 [[COND]], label %[[PRED_LOAD_IF5:.*]], label %[[PRED_LOAD_CONTINUE6:.*]] +; CHECK: [[PRED_LOAD_IF5]]: +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP3]], i64 8 +; CHECK-NEXT: [[TMP18:%.*]] = load i16, ptr [[TMP17]], align 2, !alias.scope [[META77]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i16> poison, i16 [[TMP18]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE6]] +; CHECK: [[PRED_LOAD_CONTINUE6]]: +; CHECK-NEXT: [[TMP20:%.*]] = phi <2 x i16> [ poison, %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP19]], %[[PRED_LOAD_IF5]] ] +; CHECK-NEXT: br i1 [[COND]], label %[[PRED_LOAD_IF7:.*]], label %[[PRED_LOAD_CONTINUE8]] +; CHECK: [[PRED_LOAD_IF7]]: +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP4]], i64 8 +; CHECK-NEXT: [[TMP22:%.*]] = load i16, ptr [[TMP21]], align 2, !alias.scope [[META77]] +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x i16> [[TMP20]], i16 [[TMP22]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE8]] +; CHECK: [[PRED_LOAD_CONTINUE8]]: +; CHECK-NEXT: [[TMP24:%.*]] = phi <2 x i16> [ [[TMP20]], %[[PRED_LOAD_CONTINUE6]] ], [ [[TMP23]], %[[PRED_LOAD_IF7]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[COND]], <2 x i16> [[TMP24]], <2 x i16> [[TMP16]] +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 1 +; CHECK-NEXT: store i16 [[TMP25]], ptr [[DST]], align 2, !alias.scope [[META80:![0-9]+]], !noalias [[META77]] +; 
CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP82:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep1 = getelementptr [11 x i16], ptr %src, i64 %iv + br i1 %cond, label %then, label %else + +then: + %gep2 = getelementptr i8, ptr %gep1, i64 8 + %l.0 = load i16, ptr %gep2, align 2 + br label %loop.latch + +else: + %gep3 = getelementptr i8, ptr %gep1, i64 8 + %l.1 = load i16, ptr %gep3, align 2 + br label %loop.latch + +loop.latch: + %merge = phi i16 [ %l.0, %then ], [ %l.1, %else ] + store i16 %merge, ptr %dst, align 2 + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, 100 + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} diff --git a/llvm/test/Transforms/PGOProfile/chr-lifetimes.ll b/llvm/test/Transforms/PGOProfile/chr-lifetimes.ll index b29834f9fe960..6e543b8c87fc7 100644 --- a/llvm/test/Transforms/PGOProfile/chr-lifetimes.ll +++ b/llvm/test/Transforms/PGOProfile/chr-lifetimes.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals ; RUN: opt < %s -passes='require,chr' -S | FileCheck %s declare void @foo() @@ -14,21 +14,21 @@ define void @test_chr_with_lifetimes(ptr %i) !prof !14 { ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I:%.*]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i32 [[TMP0]], 0 ; CHECK-NEXT: [[TMP9:%.*]] = freeze i1 [[TMP1]] -; CHECK-NEXT: [[TMP10:%.*]] = select i1 true, i1 [[TMP9]], i1 false +; CHECK-NEXT: [[TMP8:%.*]] = select i1 true, i1 [[TMP9]], i1 false, !prof [[PROF15:![0-9]+]] ; CHECK-NEXT: [[TMP11:%.*]] = freeze i1 [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP10]], i1 [[TMP11]], i1 false -; CHECK-NEXT: br i1 [[TMP5]], label [[ENTRY_SPLIT:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof [[PROF15:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP8]], i1 [[TMP11]], i1 false, !prof [[PROF15]] +; CHECK-NEXT: br i1 [[TMP5]], label [[ENTRY_SPLIT:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof [[PROF16:![0-9]+]] ; CHECK: entry.split: -; CHECK-NEXT: [[TMP6:%.*]] = select i1 true, i64 0, i64 4, !prof [[PROF16:![0-9]+]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 true, i64 0, i64 4, !prof [[PROF17:![0-9]+]] ; CHECK-NEXT: call void @baz(i64 [[TMP6]]) -; CHECK-NEXT: br i1 false, label [[BB1:%.*]], label [[BB0:%.*]], !prof [[PROF17:![0-9]+]] +; CHECK-NEXT: br i1 false, label [[BB1:%.*]], label [[BB0:%.*]], !prof [[PROF18:![0-9]+]] ; CHECK: bb0: ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: br label [[BB1]] ; CHECK: entry.split.nonchr: -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP1]], i64 0, i64 4, !prof [[PROF16]] +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP1]], i64 0, i64 4, !prof [[PROF17]] ; CHECK-NEXT: call void @baz(i64 [[TMP7]]) -; CHECK-NEXT: br i1 [[TMP1]], label [[BB0_NONCHR:%.*]], label [[BB1]], !prof [[PROF16]] +; CHECK-NEXT: br i1 [[TMP1]], label [[BB0_NONCHR:%.*]], label [[BB1]], !prof [[PROF17]] ; CHECK: bb0.nonchr: ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: br label [[BB1]] @@ -83,24 +83,24 @@ define void @test_chr_dynamic_alloca(ptr %i) !prof !14 { ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i32 [[TMP0]], 0 ; 
CHECK-NEXT: [[TMP2:%.*]] = freeze i1 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = select i1 true, i1 [[TMP2]], i1 false +; CHECK-NEXT: [[TMP3:%.*]] = select i1 true, i1 [[TMP2]], i1 false, !prof [[PROF15]] ; CHECK-NEXT: [[TMP4:%.*]] = freeze i1 [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], i1 [[TMP4]], i1 false -; CHECK-NEXT: br i1 [[TMP5]], label [[BB4_SPLIT:%.*]], label [[BB4_SPLIT_NONCHR:%.*]], !prof [[PROF15]] +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], i1 [[TMP4]], i1 false, !prof [[PROF15]] +; CHECK-NEXT: br i1 [[TMP5]], label [[BB4_SPLIT:%.*]], label [[BB4_SPLIT_NONCHR:%.*]], !prof [[PROF16]] ; CHECK: bb4.split: -; CHECK-NEXT: [[TMP6:%.*]] = select i1 true, i64 0, i64 4, !prof [[PROF16]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 true, i64 0, i64 4, !prof [[PROF17]] ; CHECK-NEXT: [[TEST:%.*]] = alloca i32, align 8 ; CHECK-NEXT: call void @baz(i64 [[TMP6]]) -; CHECK-NEXT: br i1 false, label [[BB1:%.*]], label [[BB0:%.*]], !prof [[PROF17]] +; CHECK-NEXT: br i1 false, label [[BB1:%.*]], label [[BB0:%.*]], !prof [[PROF18]] ; CHECK: bb0: ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: store ptr [[TEST]], ptr [[I]], align 8 ; CHECK-NEXT: br label [[BB1]] ; CHECK: bb4.split.nonchr: -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP1]], i64 0, i64 4, !prof [[PROF16]] +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP1]], i64 0, i64 4, !prof [[PROF17]] ; CHECK-NEXT: [[TEST_NONCHR:%.*]] = alloca i32, align 8 ; CHECK-NEXT: call void @baz(i64 [[TMP7]]) -; CHECK-NEXT: br i1 [[TMP1]], label [[BB0_NONCHR:%.*]], label [[BB1]], !prof [[PROF16]] +; CHECK-NEXT: br i1 [[TMP1]], label [[BB0_NONCHR:%.*]], label [[BB1]], !prof [[PROF17]] ; CHECK: bb0.nonchr: ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: store ptr [[TEST_NONCHR]], ptr [[I]], align 8 @@ -167,21 +167,21 @@ define void @test_no_move_allocas(ptr %i) !prof !14 { ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I:%.*]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i32 [[TMP0]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = freeze i1 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = select i1 true, i1 [[TMP2]], i1 false +; CHECK-NEXT: [[TMP3:%.*]] = select i1 true, i1 [[TMP2]], i1 false, !prof [[PROF15]] ; CHECK-NEXT: [[TMP4:%.*]] = freeze i1 [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], i1 [[TMP4]], i1 false -; CHECK-NEXT: br i1 [[TMP5]], label [[ENTRY_SPLIT:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof [[PROF15]] +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], i1 [[TMP4]], i1 false, !prof [[PROF15]] +; CHECK-NEXT: br i1 [[TMP5]], label [[ENTRY_SPLIT:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof [[PROF16]] ; CHECK: entry.split: -; CHECK-NEXT: [[TMP6:%.*]] = select i1 true, i64 0, i64 4, !prof [[PROF16]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 true, i64 0, i64 4, !prof [[PROF17]] ; CHECK-NEXT: call void @baz(i64 [[TMP6]]) -; CHECK-NEXT: br i1 false, label [[BB1:%.*]], label [[BB0:%.*]], !prof [[PROF17]] +; CHECK-NEXT: br i1 false, label [[BB1:%.*]], label [[BB0:%.*]], !prof [[PROF18]] ; CHECK: bb0: ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: br label [[BB1]] ; CHECK: entry.split.nonchr: -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP1]], i64 0, i64 4, !prof [[PROF16]] +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP1]], i64 0, i64 4, !prof [[PROF17]] ; CHECK-NEXT: call void @baz(i64 [[TMP7]]) -; CHECK-NEXT: br i1 [[TMP1]], label [[BB0_NONCHR:%.*]], label [[BB1]], !prof [[PROF16]] +; CHECK-NEXT: br i1 [[TMP1]], label [[BB0_NONCHR:%.*]], label [[BB1]], !prof [[PROF17]] ; CHECK: bb0.nonchr: ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: br label [[BB1]] @@ -242,4 +242,26 
@@ bb3: !14 = !{!"function_entry_count", i64 100} !15 = !{!"branch_weights", i32 0, i32 1} -; CHECK: !15 = !{!"branch_weights", i32 1000, i32 0} +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +;. +; CHECK: [[META0:![0-9]+]] = !{i32 1, !"ProfileSummary", [[META1:![0-9]+]]} +; CHECK: [[META1]] = !{[[META2:![0-9]+]], [[META3:![0-9]+]], [[META4:![0-9]+]], [[META5:![0-9]+]], [[META6:![0-9]+]], [[META7:![0-9]+]], [[META8:![0-9]+]], [[META9:![0-9]+]]} +; CHECK: [[META2]] = !{!"ProfileFormat", !"InstrProf"} +; CHECK: [[META3]] = !{!"TotalCount", i64 10000} +; CHECK: [[META4]] = !{!"MaxCount", i64 10} +; CHECK: [[META5]] = !{!"MaxInternalCount", i64 1} +; CHECK: [[META6]] = !{!"MaxFunctionCount", i64 1000} +; CHECK: [[META7]] = !{!"NumCounts", i64 3} +; CHECK: [[META8]] = !{!"NumFunctions", i64 3} +; CHECK: [[META9]] = !{!"DetailedSummary", [[META10:![0-9]+]]} +; CHECK: [[META10]] = !{[[META11:![0-9]+]], [[META12:![0-9]+]], [[META13:![0-9]+]]} +; CHECK: [[META11]] = !{i32 10000, i64 100, i32 1} +; CHECK: [[META12]] = !{i32 999000, i64 100, i32 1} +; CHECK: [[META13]] = !{i32 999999, i64 1, i32 2} +; CHECK: [[META14:![0-9]+]] = !{!"function_entry_count", i64 100} +; CHECK: [[PROF15]] = !{!"unknown", !"chr"} +; CHECK: [[PROF16]] = !{!"branch_weights", i32 1000, i32 0} +; CHECK: [[PROF17]] = !{!"branch_weights", i32 1, i32 0} +; CHECK: [[PROF18]] = !{!"branch_weights", i32 0, i32 1} +;. diff --git a/llvm/test/Transforms/PGOProfile/prof-verify-no-entrycount.ll b/llvm/test/Transforms/PGOProfile/prof-verify-no-entrycount.ll index 3b059fd7d8800..9c5f046af47af 100644 --- a/llvm/test/Transforms/PGOProfile/prof-verify-no-entrycount.ll +++ b/llvm/test/Transforms/PGOProfile/prof-verify-no-entrycount.ll @@ -1,3 +1,7 @@ +; This test fails under the profcheck configuration due to profcheck creating +; metadata. +; UNSUPPORTED: profcheck + ; Test prof-verify for functions without entry count ; RUN: not opt -passes=prof-verify %s -o - 2>&1 | FileCheck %s diff --git a/llvm/test/Transforms/PGOProfile/prof-verify.ll b/llvm/test/Transforms/PGOProfile/prof-verify.ll index 50159506e8313..75d1e6a3db571 100644 --- a/llvm/test/Transforms/PGOProfile/prof-verify.ll +++ b/llvm/test/Transforms/PGOProfile/prof-verify.ll @@ -1,3 +1,7 @@ +; This test fails under the profcheck configuration due to profcheck creating +; metadata. +; UNSUPPORTED: profcheck + ; Test prof-inject and prof-verify ; RUN: opt -passes=prof-inject %s -S -o - | FileCheck %s --check-prefix=INJECT diff --git a/llvm/test/Transforms/PGOProfile/profcheck-select.ll b/llvm/test/Transforms/PGOProfile/profcheck-select.ll index b5dc97d2d5a6d..74bcb3f52428b 100644 --- a/llvm/test/Transforms/PGOProfile/profcheck-select.ll +++ b/llvm/test/Transforms/PGOProfile/profcheck-select.ll @@ -1,3 +1,7 @@ +; This test fails under the profcheck configuration due to profcheck creating +; metadata. 
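+; (The "profcheck" lit feature is added to config.available_features in +; llvm/test/lit.cfg.py by this same patch, so lit skips any test whose +; UNSUPPORTED line names that feature.)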
+; UNSUPPORTED: profcheck + ; RUN: split-file %s %t ; RUN: opt -passes=prof-inject %t/inject.ll -S -o - | FileCheck %t/inject.ll diff --git a/llvm/test/Transforms/SLPVectorizer/X86/copyable-used-outside-with-immediate-op.ll b/llvm/test/Transforms/SLPVectorizer/X86/copyable-used-outside-with-immediate-op.ll new file mode 100644 index 0000000000000..d4aef24962313 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/copyable-used-outside-with-immediate-op.ll @@ -0,0 +1,57 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-9999 < %s | FileCheck %s + +define void @test() { +; CHECK-LABEL: define void @test() { +; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: br label %[[BB1:.*]] +; CHECK: [[BB1]]: +; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x i32> [ , %[[BB]] ], [ [[TMP6:%.*]], %[[BB14:.*]] ], [ , %[[BB10:.*]] ] +; CHECK-NEXT: br label %[[BB3:.*]] +; CHECK: [[BB3]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x float> [ zeroinitializer, %[[BB1]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x i32> [ [[TMP0]], %[[BB1]] ] +; CHECK-NEXT: br label %[[BB10]] +; CHECK: [[BB10]]: +; CHECK-NEXT: [[PHI12:%.*]] = phi float [ 0.000000e+00, %[[BB3]] ], [ 0.000000e+00, %[[BB14]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x i32> [ , %[[BB3]] ], [ [[TMP7:%.*]], %[[BB14]] ] +; CHECK-NEXT: switch i32 0, label %[[BB14]] [ +; CHECK-NEXT: i32 0, label %[[BB1]] +; CHECK-NEXT: ] +; CHECK: [[BB14]]: +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> , <4 x i32> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i32> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP6]] = or <4 x i32> [[TMP5]], +; CHECK-NEXT: [[TMP7]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP6]], <4 x i32> +; CHECK-NEXT: br i1 false, label %[[BB1]], label %[[BB10]] +; +bb: + br label %bb1 + +bb1: + %phi = phi i32 [ 0, %bb ], [ %or16, %bb14 ], [ 0, %bb10 ] + %phi2 = phi i32 [ 0, %bb ], [ %or15, %bb14 ], [ 0, %bb10 ] + br label %bb3 + +bb3: ; preds = %bb1 + %phi4 = phi i32 [ poison, %bb1 ] + %phi6 = phi i32 [ poison, %bb1 ] + %phi7 = phi i32 [ %phi, %bb1 ] + %phi9 = phi i32 [ %phi2, %bb1 ] + %0 = phi <2 x float> [ zeroinitializer, %bb1 ] + br label %bb10 + +bb10: + %phi11 = phi i32 [ 0, %bb3 ], [ %phi11, %bb14 ] + %phi12 = phi float [ 0.000000e+00, %bb3 ], [ 0.000000e+00, %bb14 ] + %phi13 = phi i32 [ 0, %bb3 ], [ %or15, %bb14 ] + switch i32 0, label %bb14 [ + i32 0, label %bb1 + ] + +bb14: + %or = or i32 %phi13, %phi11 + %or15 = or i32 %or, 0 + %or16 = or i32 %phi11, 0 + br i1 false, label %bb1, label %bb10 +} diff --git a/llvm/test/Verifier/modular-format.ll b/llvm/test/Verifier/modular-format.ll new file mode 100644 index 0000000000000..abdd73d098be1 --- /dev/null +++ b/llvm/test/Verifier/modular-format.ll @@ -0,0 +1,41 @@ +; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s + +define void @test_too_few_arguments(i32 %arg, ...) "modular-format"="printf,1,2,basic_mod" { + ret void +} +; CHECK: modular-format attribute requires at least 5 arguments +; CHECK-NEXT: ptr @test_too_few_arguments + +define void @test_first_arg_index_not_integer(i32 %arg, ...) 
"modular-format"="printf,1,foo,basic_mod,basic_impl" { + ret void +} +; CHECK: modular-format attribute first arg index is not an integer +; CHECK-NEXT: ptr @test_first_arg_index_not_integer + +define void @test_first_arg_index_zero(i32 %arg) "modular-format"="printf,1,0,basic_mod,basic_impl" { + ret void +} +; CHECK: modular-format attribute first arg index is out of bounds +; CHECK-NEXT: ptr @test_first_arg_index_zero + +define void @test_first_arg_index_out_of_bounds(i32 %arg) "modular-format"="printf,1,2,basic_mod,basic_impl" { + ret void +} +; CHECK: modular-format attribute first arg index is out of bounds +; CHECK-NEXT: ptr @test_first_arg_index_out_of_bounds + +define void @test_first_arg_index_out_of_bounds_varargs(i32 %arg, ...) "modular-format"="printf,1,3,basic_mod,basic_impl" { + ret void +} +; CHECK: modular-format attribute first arg index is out of bounds +; CHECK-NEXT: ptr @test_first_arg_index_out_of_bounds_varargs + +; CHECK-NOT: ptr @test_first_arg_index_in_bounds +define void @test_first_arg_index_in_bounds(i32 %arg) "modular-format"="printf,1,1,basic_mod,basic_impl" { + ret void +} + +; CHECK-NOT: ptr @test_first_arg_index_in_bounds_varargs +define void @test_first_arg_index_in_bounds_varargs(i32 %arg, ...) "modular-format"="printf,1,2,basic_mod,basic_impl" { + ret void +} diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index 725ddb877f9ec..94cf8bc358514 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -69,6 +69,8 @@ # profiling doesn't work quite well on GPU, excluding config.excludes.append("AMDGPU") + config.available_features.add("profcheck") + # test_source_root: The root path where tests are located. config.test_source_root = os.path.dirname(__file__) diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected index f515ee651d835..8d50df7050636 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected @@ -7,10 +7,10 @@ define i64 @i64_test(i64 %i) nounwind readnone { ; CHECK-NEXT: t0: ch,glue = EntryToken ; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %8 ; CHECK-NEXT: t4: i32,ch = CopyFromReg # D:1 t0, Register:i32 %9 -; CHECK-NEXT: t50: i64 = REG_SEQUENCE # D:1 TargetConstant:i32<75>, t2, TargetConstant:i32<3>, t4, TargetConstant:i32<11> +; CHECK-NEXT: t50: i64 = REG_SEQUENCE # D:1 TargetConstant:i32<66>, t2, TargetConstant:i32<3>, t4, TargetConstant:i32<11> ; CHECK-NEXT: t27: i32,ch = BUFFER_LOAD_DWORD_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0 ; CHECK-NEXT: t30: i32,ch = BUFFER_LOAD_DWORD_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<4>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0 -; CHECK-NEXT: t33: v2i32 = REG_SEQUENCE # D:1 TargetConstant:i32<75>, t27, TargetConstant:i32<3>, t30, TargetConstant:i32<11> +; CHECK-NEXT: t33: v2i32 = REG_SEQUENCE # D:1 TargetConstant:i32<66>, t27, TargetConstant:i32<3>, t30, TargetConstant:i32<11> ; CHECK-NEXT: t10: i64 = V_ADD_U64_PSEUDO # D:1 t50, t33 ; CHECK-NEXT: t24: i32 = EXTRACT_SUBREG # D:1 t10, TargetConstant:i32<3> ; CHECK-NEXT: t17: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t24 diff --git 
a/llvm/unittests/TargetParser/TripleTest.cpp b/llvm/unittests/TargetParser/TripleTest.cpp index 3e803691cfd1f..df8284d7be66a 100644 --- a/llvm/unittests/TargetParser/TripleTest.cpp +++ b/llvm/unittests/TargetParser/TripleTest.cpp @@ -2630,6 +2630,17 @@ TEST(TripleTest, isMacOSVersionLT) { EXPECT_FALSE(T.isMacOSXVersionLT(10, 15, 0)); } +TEST(TripleTest, isMacOSVersionGE) { + Triple T = Triple("x86_64-apple-macos11"); + EXPECT_FALSE(T.isMacOSXVersionGE(11, 1, 0)); + EXPECT_TRUE(T.isMacOSXVersionGE(10, 15, 0)); + + T = Triple("x86_64-apple-darwin20"); + EXPECT_FALSE(T.isMacOSXVersionGE(11, 1, 0)); + EXPECT_TRUE(T.isMacOSXVersionGE(11, 0, 0)); + EXPECT_TRUE(T.isMacOSXVersionGE(10, 15, 0)); +} + TEST(TripleTest, CanonicalizeOSVersion) { EXPECT_EQ(VersionTuple(10, 15, 4), Triple::getCanonicalVersionForOS(Triple::MacOSX, diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clangd/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clangd/BUILD.gn index f8c4838ab7ee3..bd225dd1e5656 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clangd/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clangd/BUILD.gn @@ -52,6 +52,7 @@ static_library("clangd") { "//clang/lib/Frontend", "//clang/lib/Index", "//clang/lib/Lex", + "//clang/lib/Options", "//clang/lib/Sema", "//clang/lib/Serialization", "//clang/lib/Tooling", diff --git a/llvm/utils/gn/secondary/clang-tools-extra/modularize/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/modularize/BUILD.gn index 1d01bd8e87d37..8f1c1d8bb0ed7 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/modularize/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/modularize/BUILD.gn @@ -6,6 +6,7 @@ executable("modularize") { "//clang/lib/Driver", "//clang/lib/Frontend", "//clang/lib/Lex", + "//clang/lib/Options", "//clang/lib/Serialization", "//clang/lib/Tooling", "//llvm/lib/Option", diff --git a/llvm/utils/gn/secondary/clang-tools-extra/pp-trace/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/pp-trace/BUILD.gn index 2c0eac074d355..5acee9dd52518 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/pp-trace/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/pp-trace/BUILD.gn @@ -5,6 +5,7 @@ executable("pp-trace") { "//clang/lib/Basic", "//clang/lib/Frontend", "//clang/lib/Lex", + "//clang/lib/Options", "//clang/lib/Serialization", "//clang/lib/Tooling", "//llvm/lib/Support", diff --git a/llvm/utils/gn/secondary/clang/include/clang/Driver/BUILD.gn b/llvm/utils/gn/secondary/clang/include/clang/Options/BUILD.gn similarity index 100% rename from llvm/utils/gn/secondary/clang/include/clang/Driver/BUILD.gn rename to llvm/utils/gn/secondary/clang/include/clang/Options/BUILD.gn diff --git a/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn index e1f31179b9a20..66dbf6152472a 100644 --- a/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn @@ -14,6 +14,7 @@ static_library("Driver") { # See the review thread of r311958 for details. "//clang/include/clang/StaticAnalyzer/Checkers", "//clang/lib/Basic", + "//clang/lib/Options", "//llvm/include/llvm/Config:llvm-config", "//llvm/lib/BinaryFormat", "//llvm/lib/Option", @@ -21,10 +22,6 @@ static_library("Driver") { "//llvm/lib/TargetParser", "//llvm/lib/WindowsDriver", ] - public_deps = [ - # public_dep because public header Options.h includes generated Options.inc. - "//clang/include/clang/Driver:Options", - ] if (host_os == "win") { # MSVCToolChain.cpp uses version.dll. 
libs = [ "version.lib" ] @@ -34,12 +31,10 @@ static_library("Driver") { "Compilation.cpp", "Distro.cpp", "Driver.cpp", - "DriverOptions.cpp", "Job.cpp", "Multilib.cpp", "MultilibBuilder.cpp", "OffloadBundler.cpp", - "OptionUtils.cpp", "Phases.cpp", "SanitizerArgs.cpp", "Tool.cpp", diff --git a/llvm/utils/gn/secondary/clang/lib/Frontend/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Frontend/BUILD.gn index 948d1405676b7..4009cfc609f4a 100644 --- a/llvm/utils/gn/secondary/clang/lib/Frontend/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Frontend/BUILD.gn @@ -9,6 +9,7 @@ static_library("Frontend") { "//clang/lib/Driver", "//clang/lib/Edit", "//clang/lib/Lex", + "//clang/lib/Options", "//clang/lib/Parse", "//clang/lib/Sema", "//clang/lib/Serialization", diff --git a/llvm/utils/gn/secondary/clang/lib/FrontendTool/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/FrontendTool/BUILD.gn index 9f7140ed391dd..707eabf1af70b 100644 --- a/llvm/utils/gn/secondary/clang/lib/FrontendTool/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/FrontendTool/BUILD.gn @@ -11,6 +11,7 @@ static_library("FrontendTool") { "//clang/lib/ExtractAPI", "//clang/lib/Frontend", "//clang/lib/Frontend/Rewrite", + "//clang/lib/Options", "//llvm/lib/Option", "//llvm/lib/Support", ] diff --git a/llvm/utils/gn/secondary/clang/lib/Options/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Options/BUILD.gn new file mode 100644 index 0000000000000..3f022ed7f7480 --- /dev/null +++ b/llvm/utils/gn/secondary/clang/lib/Options/BUILD.gn @@ -0,0 +1,19 @@ +static_library("Options") { + output_name = "clangOptions" + configs += [ "//llvm/utils/gn/build:clang_code" ] + include_dirs = [ "." ] + deps = [ + "//clang/include/clang/Config", + "//clang/lib/Basic", + "//llvm/lib/Option", + "//llvm/lib/Support", + ] + public_deps = [ + # public_dep because public header Options.h includes generated Options.inc. 
+ "//clang/include/clang/Options:Options", + ] + sources = [ + "DriverOptions.cpp", + "OptionUtils.cpp", + ] +} diff --git a/llvm/utils/gn/secondary/clang/lib/Tooling/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Tooling/BUILD.gn index 0087aad982672..5ec5500010b39 100644 --- a/llvm/utils/gn/secondary/clang/lib/Tooling/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Tooling/BUILD.gn @@ -2,7 +2,7 @@ static_library("Tooling") { output_name = "clangTooling" configs += [ "//llvm/utils/gn/build:clang_code" ] deps = [ - "//clang/include/clang/Driver:Options", + "//clang/include/clang/Options:Options", "//clang/lib/AST", "//clang/lib/ASTMatchers", "//clang/lib/Basic", @@ -10,6 +10,7 @@ static_library("Tooling") { "//clang/lib/Format", "//clang/lib/Frontend", "//clang/lib/Lex", + "//clang/lib/Options", "//clang/lib/Rewrite", "//clang/lib/Tooling/Core", "//llvm/lib/TargetParser", diff --git a/llvm/utils/gn/secondary/clang/tools/clang-check/BUILD.gn b/llvm/utils/gn/secondary/clang/tools/clang-check/BUILD.gn index 39a92f6ac921f..dd796d42038b1 100644 --- a/llvm/utils/gn/secondary/clang/tools/clang-check/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/tools/clang-check/BUILD.gn @@ -6,6 +6,7 @@ executable("clang-check") { "//clang/lib/Driver", "//clang/lib/Frontend", "//clang/lib/Frontend/Rewrite", + "//clang/lib/Options", "//clang/lib/StaticAnalyzer/Frontend", "//clang/lib/Tooling", "//clang/lib/Tooling/Syntax", diff --git a/llvm/utils/gn/secondary/clang/tools/clang-installapi/BUILD.gn b/llvm/utils/gn/secondary/clang/tools/clang-installapi/BUILD.gn index 89f0107686f91..774402a9931a9 100644 --- a/llvm/utils/gn/secondary/clang/tools/clang-installapi/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/tools/clang-installapi/BUILD.gn @@ -13,6 +13,7 @@ driver_executable("clang-installapi") { "//clang/lib/Driver", "//clang/lib/Frontend", "//clang/lib/InstallAPI", + "//clang/lib/Options", "//clang/lib/Tooling", "//llvm/lib/Support", "//llvm/lib/TargetParser", diff --git a/llvm/utils/gn/secondary/clang/tools/driver/BUILD.gn b/llvm/utils/gn/secondary/clang/tools/driver/BUILD.gn index 54fca3bf1f50b..b402bbba59ca0 100644 --- a/llvm/utils/gn/secondary/clang/tools/driver/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/tools/driver/BUILD.gn @@ -57,6 +57,7 @@ driver_executable("clang") { "//clang/lib/Frontend", "//clang/lib/FrontendTool", "//clang/lib/Headers", + "//clang/lib/Options", "//clang/tools/clang-linker-wrapper", "//clang/tools/clang-nvlink-wrapper", "//clang/tools/clang-offload-bundler", diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index e377b380685fd..b36466b3fac8b 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -1505,7 +1505,6 @@ if (current_toolchain == default_toolchain) { "__type_traits/is_reference.h", "__type_traits/is_reference_wrapper.h", "__type_traits/is_referenceable.h", - "__type_traits/is_replaceable.h", "__type_traits/is_same.h", "__type_traits/is_scalar.h", "__type_traits/is_signed.h", diff --git a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/JITLink/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/JITLink/BUILD.gn index d8dfcbd3bac2d..854aa296ba161 100644 --- a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/JITLink/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/JITLink/BUILD.gn @@ -32,6 +32,7 @@ static_library("JITLink") { "ELF_loongarch.cpp", "ELF_ppc64.cpp", "ELF_riscv.cpp", + "ELF_systemz.cpp", "ELF_x86.cpp", "ELF_x86_64.cpp", 
"JITLink.cpp", @@ -49,6 +50,7 @@ static_library("JITLink") { "loongarch.cpp", "ppc64.cpp", "riscv.cpp", + "systemz.cpp", "x86.cpp", "x86_64.cpp", ] diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt index b1f20a73c3b2b..10a7b62229b8d 100644 --- a/llvm/utils/profcheck-xfail.txt +++ b/llvm/utils/profcheck-xfail.txt @@ -608,15 +608,8 @@ Transforms/OpenMP/spmdization.ll Transforms/OpenMP/spmdization_no_guarding_two_reaching_kernels.ll Transforms/OpenMP/spmdization_remarks.ll Transforms/PartiallyInlineLibCalls/X86/good-prototype.ll -Transforms/PGOProfile/chr-dead-pred.ll -Transforms/PGOProfile/chr-dup-threshold.ll -Transforms/PGOProfile/chr-lifetimes.ll -Transforms/PGOProfile/chr-poison.ll Transforms/PGOProfile/comdat.ll Transforms/PGOProfile/memop_profile_funclet_wasm.ll -Transforms/PGOProfile/profcheck-select.ll -Transforms/PGOProfile/prof-verify.ll -Transforms/PGOProfile/prof-verify-no-entrycount.ll Transforms/PGOProfile/X86/macho.ll Transforms/PhaseOrdering/AArch64/constraint-elimination-placement.ll Transforms/PhaseOrdering/AArch64/globals-aa-required-for-vectorization.ll diff --git a/mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h b/mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h deleted file mode 100644 index 64a42a228199e..0000000000000 --- a/mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h +++ /dev/null @@ -1,21 +0,0 @@ -//===- ArithToAPFloat.h - Arith to APFloat impl conversion ---*- C++ ----*-===// -// -// Part of the APFloat Project, under the Apache License v2.0 with APFloat -// Exceptions. See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH APFloat-exception -// -//===----------------------------------------------------------------------===// - -#ifndef MLIR_CONVERSION_ARITHTOAPFLOAT_ARITHTOAPFLOAT_H -#define MLIR_CONVERSION_ARITHTOAPFLOAT_ARITHTOAPFLOAT_H - -#include - -namespace mlir { -class Pass; - -#define GEN_PASS_DECL_ARITHTOAPFLOATCONVERSIONPASS -#include "mlir/Conversion/Passes.h.inc" -} // namespace mlir - -#endif // MLIR_CONVERSION_ARITHTOAPFLOAT_ARITHTOAPFLOAT_H diff --git a/mlir/include/mlir/Conversion/Passes.h b/mlir/include/mlir/Conversion/Passes.h index 82bdfd02661a6..40d866ec7bf10 100644 --- a/mlir/include/mlir/Conversion/Passes.h +++ b/mlir/include/mlir/Conversion/Passes.h @@ -12,7 +12,6 @@ #include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h" #include "mlir/Conversion/AffineToStandard/AffineToStandard.h" #include "mlir/Conversion/ArithToAMDGPU/ArithToAMDGPU.h" -#include "mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h" #include "mlir/Conversion/ArithToArmSME/ArithToArmSME.h" #include "mlir/Conversion/ArithToEmitC/ArithToEmitCPass.h" #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h" diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td index d5665b439b059..e0cac8b699c30 100644 --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -186,21 +186,6 @@ def ArithToLLVMConversionPass : Pass<"convert-arith-to-llvm"> { ]; } -//===----------------------------------------------------------------------===// -// ArithToAPFloat -//===----------------------------------------------------------------------===// - -def ArithToAPFloatConversionPass - : Pass<"convert-arith-to-apfloat", "ModuleOp"> { - let summary = "Convert Arith ops to APFloat runtime library calls"; - let description = [{ - This pass converts supported Arith ops to APFloat-based runtime library - calls 
(APFloatWrappers.cpp). APFloat is a software implementation of - floating-point arithmetic operations. - }]; - let dependentDialects = ["func::FuncDialect"]; -} - //===----------------------------------------------------------------------===// // ArithToSPIRV //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/Func/Utils/Utils.h b/mlir/include/mlir/Dialect/Func/Utils/Utils.h index 00d50874a2e8d..3576126a487ac 100644 --- a/mlir/include/mlir/Dialect/Func/Utils/Utils.h +++ b/mlir/include/mlir/Dialect/Func/Utils/Utils.h @@ -60,13 +60,6 @@ mlir::FailureOr> deduplicateArgsOfFuncOp(mlir::RewriterBase &rewriter, mlir::func::FuncOp funcOp, mlir::ModuleOp moduleOp); -/// Look up a FuncOp with signature `resultTypes`(`paramTypes`)` and name -/// `name`. Return a failure if the FuncOp is found but with a different -/// signature. -FailureOr lookupFnDecl(SymbolOpInterface symTable, StringRef name, - FunctionType funcT, - SymbolTableCollection *symbolTables = nullptr); - } // namespace func } // namespace mlir diff --git a/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h b/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h index b09d32022e348..8ad9ed18acebd 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h +++ b/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h @@ -52,10 +52,6 @@ lookupOrCreatePrintF32Fn(OpBuilder &b, Operation *moduleOp, FailureOr lookupOrCreatePrintF64Fn(OpBuilder &b, Operation *moduleOp, SymbolTableCollection *symbolTables = nullptr); -FailureOr -lookupOrCreateApFloatPrintFn(OpBuilder &b, Operation *moduleOp, - SymbolTableCollection *symbolTables = nullptr); - /// Declares a function to print a C-string. /// If a custom runtime function is defined via `runtimeFunctionName`, it must /// have the signature void(char const*). The default function is `printString`. diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMEnums.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMEnums.td index e7b44fd94b26d..e2edab44153ca 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMEnums.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMEnums.td @@ -758,13 +758,16 @@ def FramePointerKindAll : LLVM_EnumAttrCase<"All", "all", "All", 2>; def FramePointerKindReserved : LLVM_EnumAttrCase<"Reserved", "reserved", "Reserved", 3>; +def FramePointerKindNonLeafNoReserve + : LLVM_EnumAttrCase<"NonLeafNoReserve", "non-leaf-no-reserve", "NonLeafNoReserve", 4>; def FramePointerKindEnum : LLVM_EnumAttr< "FramePointerKind", "::llvm::FramePointerKind", "LLVM FramePointerKind", [FramePointerKindNone, FramePointerKindNonLeaf, - FramePointerKindAll, FramePointerKindReserved]> { + FramePointerKindAll, FramePointerKindReserved, + FramePointerKindNonLeafNoReserve]> { let cppNamespace = "::mlir::LLVM::framePointerKind"; } diff --git a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h index a57aadcdcc5b0..45626aa280946 100644 --- a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h +++ b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h @@ -219,13 +219,17 @@ bool isLinearizableVector(VectorType type); /// Creates a TransferReadOp from `source`. /// -/// The shape of the vector to read is specified via `inputVectorSizes`. If the -/// shape of the output vector differs from the shape of the value being read, -/// masking is used to avoid out-of-bounds accesses. 
Set +/// If the shape of the vector to read differs from the shape of the value being +/// read, masking is used to avoid out-of-bounds accesses. Set /// `useInBoundsInsteadOfMasking` to `true` to use the "in_bounds" attribute /// instead of explicit masks. /// /// Note: all read offsets are set to 0. +Value createReadOrMaskedRead(OpBuilder &builder, Location loc, Value source, + const VectorType &vecToReadTy, + std::optional<Value> padValue = std::nullopt, + bool useInBoundsInsteadOfMasking = false); + Value createReadOrMaskedRead(OpBuilder &builder, Location loc, Value source, ArrayRef<int64_t> inputVectorSizes, std::optional<Value> padValue = std::nullopt, diff --git a/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp b/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp deleted file mode 100644 index 01fd5b278aca4..0000000000000 --- a/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp +++ /dev/null @@ -1,158 +0,0 @@ -//===- ArithToAPFloat.cpp - Arithmetic to APFloat Conversion --------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h" - -#include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/Arith/Transforms/Passes.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/Func/Utils/Utils.h" -#include "mlir/IR/PatternMatch.h" -#include "mlir/IR/Verifier.h" -#include "mlir/Transforms/WalkPatternRewriteDriver.h" - -namespace mlir { -#define GEN_PASS_DEF_ARITHTOAPFLOATCONVERSIONPASS -#include "mlir/Conversion/Passes.h.inc" -} // namespace mlir - -using namespace mlir; -using namespace mlir::func; - -static FuncOp createFnDecl(OpBuilder &b, SymbolOpInterface symTable, - StringRef name, FunctionType funcT, bool setPrivate, - SymbolTableCollection *symbolTables = nullptr) { - OpBuilder::InsertionGuard g(b); - assert(!symTable->getRegion(0).empty() && "expected non-empty region"); - b.setInsertionPointToStart(&symTable->getRegion(0).front()); - FuncOp funcOp = FuncOp::create(b, symTable->getLoc(), name, funcT); - if (setPrivate) - funcOp.setPrivate(); - if (symbolTables) { - SymbolTable &symbolTable = symbolTables->getSymbolTable(symTable); - symbolTable.insert(funcOp, symTable->getRegion(0).front().begin()); - } - return funcOp; -} - -/// Helper function to look up or create the symbol for a runtime library -/// function for a binary arithmetic operation. -/// -/// Parameter 1: APFloat semantics -/// Parameter 2: Left-hand side operand -/// Parameter 3: Right-hand side operand -/// -/// This function will return a failure if the function is found but has an -/// unexpected signature. -/// -static FailureOr<FuncOp> -lookupOrCreateBinaryFn(OpBuilder &b, SymbolOpInterface symTable, StringRef name, - SymbolTableCollection *symbolTables = nullptr) { - auto i32Type = IntegerType::get(symTable->getContext(), 32); - auto i64Type = IntegerType::get(symTable->getContext(), 64); - - std::string funcName = (llvm::Twine("_mlir_apfloat_") + name).str(); - FunctionType funcT = - FunctionType::get(b.getContext(), {i32Type, i64Type, i64Type}, {i64Type}); - FailureOr<FuncOp> func = - lookupFnDecl(symTable, funcName, funcT, symbolTables); - // Failed due to type mismatch. - if (failed(func)) - return func; - // Successfully matched existing decl. 
- if (*func) - return *func; - - return createFnDecl(b, symTable, funcName, funcT, - /*setPrivate=*/true, symbolTables); -} - -/// Rewrite a binary arithmetic operation to an APFloat function call. -template -struct BinaryArithOpToAPFloatConversion final : OpRewritePattern { - BinaryArithOpToAPFloatConversion(MLIRContext *context, PatternBenefit benefit, - SymbolOpInterface symTable) - : OpRewritePattern(context, benefit), symTable(symTable) {}; - - LogicalResult matchAndRewrite(OpTy op, - PatternRewriter &rewriter) const override { - // Get APFloat function from runtime library. - FailureOr fn = - lookupOrCreateBinaryFn(rewriter, symTable, APFloatName); - if (failed(fn)) - return fn; - - rewriter.setInsertionPoint(op); - // Cast operands to 64-bit integers. - Location loc = op.getLoc(); - auto floatTy = cast(op.getType()); - auto intWType = rewriter.getIntegerType(floatTy.getWidth()); - auto int64Type = rewriter.getI64Type(); - Value lhsBits = arith::ExtUIOp::create( - rewriter, loc, int64Type, - arith::BitcastOp::create(rewriter, loc, intWType, op.getLhs())); - Value rhsBits = arith::ExtUIOp::create( - rewriter, loc, int64Type, - arith::BitcastOp::create(rewriter, loc, intWType, op.getRhs())); - - // Call APFloat function. - int32_t sem = - llvm::APFloatBase::SemanticsToEnum(floatTy.getFloatSemantics()); - Value semValue = arith::ConstantOp::create( - rewriter, loc, rewriter.getI32Type(), - rewriter.getIntegerAttr(rewriter.getI32Type(), sem)); - SmallVector params = {semValue, lhsBits, rhsBits}; - auto resultOp = - func::CallOp::create(rewriter, loc, TypeRange(rewriter.getI64Type()), - SymbolRefAttr::get(*fn), params); - - // Truncate result to the original width. - Value truncatedBits = arith::TruncIOp::create(rewriter, loc, intWType, - resultOp->getResult(0)); - rewriter.replaceOp( - op, arith::BitcastOp::create(rewriter, loc, floatTy, truncatedBits)); - return success(); - } - - SymbolOpInterface symTable; -}; - -namespace { -struct ArithToAPFloatConversionPass final - : impl::ArithToAPFloatConversionPassBase { - using Base::Base; - - void runOnOperation() override { - MLIRContext *context = &getContext(); - RewritePatternSet patterns(context); - static const char add[] = "add"; - static const char subtract[] = "subtract"; - static const char multiply[] = "multiply"; - static const char divide[] = "divide"; - static const char remainder[] = "remainder"; - patterns.add, - BinaryArithOpToAPFloatConversion, - BinaryArithOpToAPFloatConversion, - BinaryArithOpToAPFloatConversion, - BinaryArithOpToAPFloatConversion>( - context, 1, getOperation()); - LogicalResult result = success(); - ScopedDiagnosticHandler scopedHandler(context, [&result](Diagnostic &diag) { - if (diag.getSeverity() == DiagnosticSeverity::Error) { - result = failure(); - } - // NB: if you don't return failure, no other diag handlers will fire (see - // mlir/lib/IR/Diagnostics.cpp:DiagnosticEngineImpl::emit). 
- return failure(); - }); - walkAndApplyPatterns(getOperation(), std::move(patterns)); - if (failed(result)) - return signalPassFailure(); - } -}; -} // namespace diff --git a/mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt b/mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt deleted file mode 100644 index b5ec49c087163..0000000000000 --- a/mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt +++ /dev/null @@ -1,18 +0,0 @@ -add_mlir_conversion_library(MLIRArithToAPFloat - ArithToAPFloat.cpp - - ADDITIONAL_HEADER_DIRS - ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/ArithToLLVM - - DEPENDS - MLIRConversionPassIncGen - - LINK_COMPONENTS - Core - - LINK_LIBS PUBLIC - MLIRArithDialect - MLIRArithTransforms - MLIRFuncDialect - MLIRFuncUtils - ) diff --git a/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp b/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp index f2bacc3399144..b6099902cc337 100644 --- a/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp +++ b/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp @@ -14,7 +14,6 @@ #include "mlir/Conversion/LLVMCommon/VectorPattern.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Arith/Transforms/Passes.h" -#include "mlir/Dialect/LLVMIR/FunctionCallUtils.h" #include "mlir/Dialect/LLVMIR/LLVMAttrs.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/IR/TypeUtilities.h" diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt index 613dc6d242ceb..bebf1b8fff3f9 100644 --- a/mlir/lib/Conversion/CMakeLists.txt +++ b/mlir/lib/Conversion/CMakeLists.txt @@ -2,7 +2,6 @@ add_subdirectory(AffineToStandard) add_subdirectory(AMDGPUToROCDL) add_subdirectory(ArithCommon) add_subdirectory(ArithToAMDGPU) -add_subdirectory(ArithToAPFloat) add_subdirectory(ArithToArmSME) add_subdirectory(ArithToEmitC) add_subdirectory(ArithToLLVM) diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp index c747e1b59558a..69a317ecd101f 100644 --- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp +++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp @@ -1654,20 +1654,6 @@ class VectorPrintOpConversion : public ConvertOpToLLVMPattern { return failure(); } } - } else if (auto floatTy = dyn_cast(printType)) { - // Print other floating-point types using the APFloat runtime library. 
-    int32_t sem =
-        llvm::APFloatBase::SemanticsToEnum(floatTy.getFloatSemantics());
-    Value semValue = LLVM::ConstantOp::create(
-        rewriter, loc, rewriter.getI32Type(),
-        rewriter.getIntegerAttr(rewriter.getI32Type(), sem));
-    Value floatBits =
-        LLVM::ZExtOp::create(rewriter, loc, rewriter.getI64Type(), value);
-    printer =
-        LLVM::lookupOrCreateApFloatPrintFn(rewriter, parent, symbolTables);
-    emitCall(rewriter, loc, printer.value(),
-             ValueRange({semValue, floatBits}));
-    return success();
   } else {
     return failure();
   }
diff --git a/mlir/lib/Dialect/Func/Utils/Utils.cpp b/mlir/lib/Dialect/Func/Utils/Utils.cpp
index d6dfd0229963c..b4cb0932ef631 100644
--- a/mlir/lib/Dialect/Func/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Func/Utils/Utils.cpp
@@ -254,28 +254,3 @@ func::deduplicateArgsOfFuncOp(RewriterBase &rewriter, func::FuncOp funcOp,
 
   return std::make_pair(*newFuncOpOrFailure, newCallOp);
 }
-
-FailureOr<func::FuncOp>
-func::lookupFnDecl(SymbolOpInterface symTable, StringRef name,
-                   FunctionType funcT, SymbolTableCollection *symbolTables) {
-  FuncOp func;
-  if (symbolTables) {
-    func = symbolTables->lookupSymbolIn<FuncOp>(
-        symTable, StringAttr::get(symTable->getContext(), name));
-  } else {
-    func = llvm::dyn_cast_or_null<FuncOp>(
-        SymbolTable::lookupSymbolIn(symTable, name));
-  }
-
-  if (!func)
-    return func;
-
-  mlir::FunctionType foundFuncT = func.getFunctionType();
-  // Assert that the signature of the found function is the same as expected.
-  if (funcT != foundFuncT) {
-    return func.emitError("matched function '")
-           << name << "' but with different type: " << foundFuncT
-           << " (expected " << funcT << ")";
-  }
-  return func;
-}
diff --git a/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp b/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
index 160b6ae89215c..feaffa34897b6 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
@@ -30,7 +30,6 @@ static constexpr llvm::StringRef kPrintF16 = "printF16";
 static constexpr llvm::StringRef kPrintBF16 = "printBF16";
 static constexpr llvm::StringRef kPrintF32 = "printF32";
 static constexpr llvm::StringRef kPrintF64 = "printF64";
-static constexpr llvm::StringRef kPrintApFloat = "printApFloat";
 static constexpr llvm::StringRef kPrintString = "printString";
 static constexpr llvm::StringRef kPrintOpen = "printOpen";
 static constexpr llvm::StringRef kPrintClose = "printClose";
@@ -161,16 +160,6 @@ mlir::LLVM::lookupOrCreatePrintF64Fn(OpBuilder &b, Operation *moduleOp,
       LLVM::LLVMVoidType::get(moduleOp->getContext()), symbolTables);
 }
 
-FailureOr<LLVM::LLVMFuncOp>
-mlir::LLVM::lookupOrCreateApFloatPrintFn(OpBuilder &b, Operation *moduleOp,
-                                         SymbolTableCollection *symbolTables) {
-  return lookupOrCreateReservedFn(
-      b, moduleOp, kPrintApFloat,
-      {IntegerType::get(moduleOp->getContext(), 32),
-       IntegerType::get(moduleOp->getContext(), 64)},
-      LLVM::LLVMVoidType::get(moduleOp->getContext()), symbolTables);
-}
-
 static LLVM::LLVMPointerType getCharPtr(MLIRContext *context) {
   return LLVM::LLVMPointerType::get(context);
 }
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 19d2d854a3838..dcf84c46949f3 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -1890,9 +1890,8 @@ vectorizeAsTensorPackOp(RewriterBase &rewriter, linalg::PackOp packOp,
   // Create masked TransferReadOp.
   auto maskedRead = vector::createReadOrMaskedRead(
-      rewriter, loc, packOp.getSource(), readVecType.getShape(), padValue,
-      useInBoundsInsteadOfMasking,
-      /*inputScalableVecSizes=*/{});
+      rewriter, loc, packOp.getSource(), readVecType, padValue,
+      useInBoundsInsteadOfMasking);
 
   // Create ShapeCastOp.
   auto shapeCastOp = vector::ShapeCastOp::create(
@@ -1977,9 +1976,12 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp,
   }
 
   // -- Generate the read operation --
+  VectorType readVecType =
+      VectorType::get(readVectorSizes, unpackTensorType.getElementType(),
+                      readScalableVectorFlags);
   Value readResult = vector::createReadOrMaskedRead(
-      rewriter, loc, unpackOp.getSource(), readVectorSizes, std::nullopt,
-      useInBoundsInsteadOfMasking, readScalableVectorFlags);
+      rewriter, loc, unpackOp.getSource(), readVecType, std::nullopt,
+      useInBoundsInsteadOfMasking);
 
   // -- Generate the transpose operation --
   PackingMetadata packMetadata;
@@ -2025,9 +2027,10 @@ vectorizeAsTensorPadOp(RewriterBase &rewriter, tensor::PadOp padOp,
           .reifyResultShapes(rewriter, reifiedReturnShapes);
   (void)status; // prevent unused variable warning on non-assert builds
   assert(succeeded(status) && "failed to reify result shapes");
+  auto readType = VectorType::get(inputVectorSizes, padValue.getType());
   auto maskedRead = vector::createReadOrMaskedRead(
-      rewriter, loc, padOp.getSource(), inputVectorSizes, padValue,
-      /*useInBoundsInsteadOfMasking=*/false, /*inputScalableVecSizes=*/{});
+      rewriter, loc, padOp.getSource(), readType, padValue,
+      /*useInBoundsInsteadOfMasking=*/false);
 
   // Create Xfer write Op
   Value dest = tensor::EmptyOp::create(rewriter, loc, reifiedReturnShapes[0],
@@ -2222,9 +2225,9 @@ vectorizeAsLinalgContraction(RewriterBase &rewriter, VectorizationState &state,
         state.getCanonicalVecType(elemType, readMap.compose(indexingMap));
 
     Value read = mlir::vector::createReadOrMaskedRead(
-        rewriter, loc, opOperand.get(), readType.getShape(),
+        rewriter, loc, opOperand.get(), readType,
        /*padding=*/arith::getZeroConstant(rewriter, loc, elemType),
-        /*useInBoundsInsteadOfMasking=*/false, readType.getScalableDims());
+        /*useInBoundsInsteadOfMasking=*/false);
 
     vecOperands.push_back(read);
   }
@@ -3165,9 +3168,8 @@ vectorizeAsInsertSliceOp(RewriterBase &rewriter, tensor::InsertSliceOp sliceOp,
   SmallVector<Value> readIndices(
       vecType.getRank(), arith::ConstantIndexOp::create(rewriter, loc, 0));
   Value read = mlir::vector::createReadOrMaskedRead(
-      rewriter, loc, source, vecType.getShape(), padValue,
-      /*useInBoundsInsteadOfMasking=*/inputVectorSizes.empty(),
-      /*inputScalableVecSizes=*/{});
+      rewriter, loc, source, vecType, padValue,
+      /*useInBoundsInsteadOfMasking=*/inputVectorSizes.empty());
 
   // Create write
   auto writeIndices =
diff --git a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
index c809c50206793..c307fb441e3ad 100644
--- a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
+++ b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
@@ -322,46 +322,61 @@ Value vector::createReadOrMaskedRead(OpBuilder &builder, Location loc,
                                      std::optional<Value> padValue,
                                      bool useInBoundsInsteadOfMasking,
                                      ArrayRef<bool> inputScalableVecDims) {
-  assert(!llvm::is_contained(inputVectorSizes, ShapedType::kDynamic) &&
+  VectorType vecToReadTy = VectorType::get(
+      inputVectorSizes, cast<ShapedType>(source.getType()).getElementType(),
+      inputScalableVecDims);
+
+  return createReadOrMaskedRead(builder, loc, source, vecToReadTy, padValue,
+                                useInBoundsInsteadOfMasking);
+}
+
+Value vector::createReadOrMaskedRead(OpBuilder &builder, Location loc,
+                                     Value source,
+                                     const VectorType &vecToReadTy,
+                                     std::optional<Value> padValue,
+                                     bool useInBoundsInsteadOfMasking) {
+  assert(!llvm::is_contained(vecToReadTy.getScalableDims(),
+                             ShapedType::kDynamic) &&
          "invalid input vector sizes");
   auto sourceShapedType = cast<ShapedType>(source.getType());
   auto sourceShape = sourceShapedType.getShape();
-  assert(sourceShape.size() == inputVectorSizes.size() &&
+
+  int64_t vecToReadRank = vecToReadTy.getRank();
+  auto vecToReadShape = vecToReadTy.getShape();
+
+  assert(sourceShape.size() == static_cast<size_t>(vecToReadRank) &&
          "expected same ranks.");
-  auto vectorType =
-      VectorType::get(inputVectorSizes, sourceShapedType.getElementType(),
-                      inputScalableVecDims);
   assert((!padValue.has_value() ||
           padValue.value().getType() == sourceShapedType.getElementType()) &&
          "expected same pad element type to match source element type");
-  int64_t readRank = inputVectorSizes.size();
+
   auto zero = arith::ConstantIndexOp::create(builder, loc, 0);
-  SmallVector<bool> inBoundsVal(readRank, true);
+  SmallVector<bool> inBoundsVal(vecToReadRank, true);
+
   if (useInBoundsInsteadOfMasking) {
     // Update the inBounds attribute.
     // FIXME: This computation is too weak - it ignores the read indices.
-    for (unsigned i = 0; i < readRank; i++)
-      inBoundsVal[i] = (sourceShape[i] == inputVectorSizes[i]) &&
+    for (unsigned i = 0; i < vecToReadRank; i++)
+      inBoundsVal[i] = (sourceShape[i] == vecToReadShape[i]) &&
                        ShapedType::isStatic(sourceShape[i]);
   }
   auto transferReadOp = vector::TransferReadOp::create(
       builder, loc,
-      /*vectorType=*/vectorType,
+      /*vectorType=*/vecToReadTy,
       /*source=*/source,
-      /*indices=*/SmallVector<Value>(readRank, zero),
+      /*indices=*/SmallVector<Value>(vecToReadRank, zero),
       /*padding=*/padValue,
       /*inBounds=*/inBoundsVal);
 
-  if (llvm::equal(inputVectorSizes, sourceShape) || useInBoundsInsteadOfMasking)
+  if (llvm::equal(vecToReadTy.getShape(), sourceShape) ||
+      useInBoundsInsteadOfMasking)
     return transferReadOp;
 
   SmallVector<OpFoldResult> mixedSourceDims =
       isa<MemRefType>(source.getType())
          ? memref::getMixedSizes(builder, loc, source)
          : tensor::getMixedSizes(builder, loc, source);
 
-  auto maskType = VectorType::get(inputVectorSizes, builder.getI1Type(),
-                                  inputScalableVecDims);
+  auto maskType = vecToReadTy.cloneWith(/*shape=*/{}, builder.getI1Type());
   Value mask =
       vector::CreateMaskOp::create(builder, loc, maskType, mixedSourceDims);
   return mlir::vector::maskOperation(builder, transferReadOp, mask)
diff --git a/mlir/lib/ExecutionEngine/APFloatWrappers.cpp b/mlir/lib/ExecutionEngine/APFloatWrappers.cpp
deleted file mode 100644
index 0a05f7369e556..0000000000000
--- a/mlir/lib/ExecutionEngine/APFloatWrappers.cpp
+++ /dev/null
@@ -1,89 +0,0 @@
-//===- APFloatWrappers.cpp - Software Implementation of FP Arithmetic ----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file exposes the APFloat infrastructure to MLIR programs as a runtime
-// library. APFloat is a software implementation of floating-point arithmetic.
-//
-// On the MLIR side, floating-point values must be bitcast to 64-bit integers
-// before calling a runtime function. If a floating-point type has fewer than
-// 64 bits, it must be zero-extended to 64 bits after bitcasting it to an
-// integer.
-//
-// Runtime functions receive the floating-point operands of the arithmetic
-// operation in the form of 64-bit integers, along with the APFloat semantics
-// in the form of a 32-bit integer, which will be interpreted as an
-// APFloatBase::Semantics enum value.
-//
-#include "llvm/ADT/APFloat.h"
-
-#ifdef _WIN32
-#ifndef MLIR_APFLOAT_WRAPPERS_EXPORT
-#ifdef mlir_apfloat_wrappers_EXPORTS
-// We are building this library
-#define MLIR_APFLOAT_WRAPPERS_EXPORT __declspec(dllexport)
-#else
-// We are using this library
-#define MLIR_APFLOAT_WRAPPERS_EXPORT __declspec(dllimport)
-#endif // mlir_apfloat_wrappers_EXPORTS
-#endif // MLIR_APFLOAT_WRAPPERS_EXPORT
-#else
-// Non-windows: use visibility attributes.
-#define MLIR_APFLOAT_WRAPPERS_EXPORT __attribute__((visibility("default")))
-#endif // _WIN32
-
-/// Binary operations without rounding mode.
-#define APFLOAT_BINARY_OP(OP)                                                 \
-  MLIR_APFLOAT_WRAPPERS_EXPORT int64_t _mlir_apfloat_##OP(                    \
-      int32_t semantics, uint64_t a, uint64_t b) {                           \
-    const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics(      \
-        static_cast<llvm::APFloatBase::Semantics>(semantics));               \
-    unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem);         \
-    llvm::APFloat lhs(sem, llvm::APInt(bitWidth, a));                        \
-    llvm::APFloat rhs(sem, llvm::APInt(bitWidth, b));                        \
-    lhs.OP(rhs);                                                             \
-    return lhs.bitcastToAPInt().getZExtValue();                              \
-  }
-
-/// Binary operations with rounding mode.
-#define APFLOAT_BINARY_OP_ROUNDING_MODE(OP, ROUNDING_MODE)                    \
-  MLIR_APFLOAT_WRAPPERS_EXPORT int64_t _mlir_apfloat_##OP(                    \
-      int32_t semantics, uint64_t a, uint64_t b) {                           \
-    const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics(      \
-        static_cast<llvm::APFloatBase::Semantics>(semantics));               \
-    unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem);         \
-    llvm::APFloat lhs(sem, llvm::APInt(bitWidth, a));                        \
-    llvm::APFloat rhs(sem, llvm::APInt(bitWidth, b));                        \
-    lhs.OP(rhs, ROUNDING_MODE);                                              \
-    return lhs.bitcastToAPInt().getZExtValue();                              \
-  }
-
-extern "C" {
-
-#define BIN_OPS_WITH_ROUNDING(X)                                              \
-  X(add, llvm::RoundingMode::NearestTiesToEven)                               \
-  X(subtract, llvm::RoundingMode::NearestTiesToEven)                          \
-  X(multiply, llvm::RoundingMode::NearestTiesToEven)                          \
-  X(divide, llvm::RoundingMode::NearestTiesToEven)
-
-BIN_OPS_WITH_ROUNDING(APFLOAT_BINARY_OP_ROUNDING_MODE)
-#undef BIN_OPS_WITH_ROUNDING
-#undef APFLOAT_BINARY_OP_ROUNDING_MODE
-
-APFLOAT_BINARY_OP(remainder)
-
-#undef APFLOAT_BINARY_OP
-
-MLIR_APFLOAT_WRAPPERS_EXPORT void printApFloat(int32_t semantics, uint64_t a) {
-  const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics(
-      static_cast<llvm::APFloatBase::Semantics>(semantics));
-  unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem);
-  llvm::APFloat x(sem, llvm::APInt(bitWidth, a));
-  double d = x.convertToDouble();
-  fprintf(stdout, "%lg", d);
-}
-}
diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
index 8c09e50e4de7b..fdeb4dacf9278 100644
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -2,7 +2,6 @@
 # is a big dependency which most don't need.
 set(LLVM_OPTIONAL_SOURCES
-  APFloatWrappers.cpp
   ArmRunnerUtils.cpp
   ArmSMEStubs.cpp
   AsyncRuntime.cpp
@@ -168,15 +167,6 @@ if(LLVM_ENABLE_PIC)
   set_property(TARGET mlir_float16_utils PROPERTY CXX_STANDARD 17)
   target_compile_definitions(mlir_float16_utils PRIVATE mlir_float16_utils_EXPORTS)
 
-  add_mlir_library(mlir_apfloat_wrappers
-    SHARED
-    APFloatWrappers.cpp
-
-    EXCLUDE_FROM_LIBMLIR
-  )
-  set_property(TARGET mlir_apfloat_wrappers PROPERTY CXX_STANDARD 17)
-  target_compile_definitions(mlir_apfloat_wrappers PRIVATE mlir_apfloat_wrappers_EXPORTS)
-
   add_subdirectory(SparseTensor)
 
   add_mlir_library(mlir_c_runner_utils
@@ -187,7 +177,6 @@ if(LLVM_ENABLE_PIC)
     EXCLUDE_FROM_LIBMLIR
 
     LINK_LIBS PUBLIC
-    mlir_apfloat_wrappers
     mlir_float16_utils
     MLIRSparseTensorEnums
     MLIRSparseTensorRuntime
@@ -202,7 +191,6 @@ if(LLVM_ENABLE_PIC)
     EXCLUDE_FROM_LIBMLIR
 
     LINK_LIBS PUBLIC
-    mlir_apfloat_wrappers
     mlir_float16_utils
   )
   target_compile_definitions(mlir_runner_utils PRIVATE mlir_runner_utils_EXPORTS)
diff --git a/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir b/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir
deleted file mode 100644
index 797f42c37a26f..0000000000000
--- a/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir
+++ /dev/null
@@ -1,128 +0,0 @@
-// RUN: mlir-opt %s --convert-arith-to-apfloat -split-input-file -verify-diagnostics | FileCheck %s
-
-// CHECK-LABEL: func.func private @_mlir_apfloat_add(i32, i64, i64) -> i64
-
-// CHECK-LABEL: func.func @foo() -> f8E4M3FN {
-// CHECK: %[[CONSTANT_0:.*]] = arith.constant 2.250000e+00 : f8E4M3FN
-// CHECK: return %[[CONSTANT_0]] : f8E4M3FN
-// CHECK: }
-
-// CHECK-LABEL: func.func @bar() -> f6E3M2FN {
-// CHECK: %[[CONSTANT_0:.*]] = arith.constant 3.000000e+00 : f6E3M2FN
-// CHECK: return %[[CONSTANT_0]] : f6E3M2FN
-// CHECK: }
-
-// Illustrate that both f8E4M3FN and f6E3M2FN calling the same _mlir_apfloat_add is fine
-// because each gets its own semantics enum and gets bitcast/extui/trunci to its own width.
-// CHECK-LABEL: func.func @full_example() {
-// CHECK: %[[CONSTANT_0:.*]] = arith.constant 1.375000e+00 : f8E4M3FN
-// CHECK: %[[VAL_0:.*]] = call @foo() : () -> f8E4M3FN
-// CHECK: %[[BITCAST_0:.*]] = arith.bitcast %[[CONSTANT_0]] : f8E4M3FN to i8
-// CHECK: %[[EXTUI_0:.*]] = arith.extui %[[BITCAST_0]] : i8 to i64
-// CHECK: %[[BITCAST_1:.*]] = arith.bitcast %[[VAL_0]] : f8E4M3FN to i8
-// CHECK: %[[EXTUI_1:.*]] = arith.extui %[[BITCAST_1]] : i8 to i64
-// // fltSemantics semantics for f8E4M3FN
-// CHECK: %[[CONSTANT_1:.*]] = arith.constant 10 : i32
-// CHECK: %[[VAL_1:.*]] = call @_mlir_apfloat_add(%[[CONSTANT_1]], %[[EXTUI_0]], %[[EXTUI_1]]) : (i32, i64, i64) -> i64
-// CHECK: %[[TRUNCI_0:.*]] = arith.trunci %[[VAL_1]] : i64 to i8
-// CHECK: %[[BITCAST_2:.*]] = arith.bitcast %[[TRUNCI_0]] : i8 to f8E4M3FN
-// CHECK: vector.print %[[BITCAST_2]] : f8E4M3FN
-
-// CHECK: %[[CONSTANT_2:.*]] = arith.constant 2.500000e+00 : f6E3M2FN
-// CHECK: %[[VAL_2:.*]] = call @bar() : () -> f6E3M2FN
-// CHECK: %[[BITCAST_3:.*]] = arith.bitcast %[[CONSTANT_2]] : f6E3M2FN to i6
-// CHECK: %[[EXTUI_2:.*]] = arith.extui %[[BITCAST_3]] : i6 to i64
-// CHECK: %[[BITCAST_4:.*]] = arith.bitcast %[[VAL_2]] : f6E3M2FN to i6
-// CHECK: %[[EXTUI_3:.*]] = arith.extui %[[BITCAST_4]] : i6 to i64
-// // fltSemantics semantics for f6E3M2FN
-// CHECK: %[[CONSTANT_3:.*]] = arith.constant 16 : i32
-// CHECK: %[[VAL_3:.*]] = call @_mlir_apfloat_add(%[[CONSTANT_3]], %[[EXTUI_2]], %[[EXTUI_3]]) : (i32, i64, i64) -> i64
-// CHECK: %[[TRUNCI_1:.*]] = arith.trunci %[[VAL_3]] : i64 to i6
-// CHECK: %[[BITCAST_5:.*]] = arith.bitcast %[[TRUNCI_1]] : i6 to f6E3M2FN
-// CHECK: vector.print %[[BITCAST_5]] : f6E3M2FN
-// CHECK: return
-// CHECK: }
-
-// Put rhs into separate function so that it won't be constant-folded.
-func.func @foo() -> f8E4M3FN {
-  %cst = arith.constant 2.2 : f8E4M3FN
-  return %cst : f8E4M3FN
-}
-
-func.func @bar() -> f6E3M2FN {
-  %cst = arith.constant 3.2 : f6E3M2FN
-  return %cst : f6E3M2FN
-}
-
-func.func @full_example() {
-  %a = arith.constant 1.4 : f8E4M3FN
-  %b = func.call @foo() : () -> (f8E4M3FN)
-  %c = arith.addf %a, %b : f8E4M3FN
-  vector.print %c : f8E4M3FN
-
-  %d = arith.constant 2.4 : f6E3M2FN
-  %e = func.call @bar() : () -> (f6E3M2FN)
-  %f = arith.addf %d, %e : f6E3M2FN
-  vector.print %f : f6E3M2FN
-  return
-}
-
-// -----
-
-// CHECK: func.func private @_mlir_apfloat_add(i32, i64, i64) -> i64
-// CHECK: %[[sem:.*]] = arith.constant 18 : i32
-// CHECK: call @_mlir_apfloat_add(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
-func.func @addf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
-  %0 = arith.addf %arg0, %arg1 : f4E2M1FN
-  return
-}
-
-// -----
-
-// Test decl collision (different type)
-// expected-error@+1{{matched function '_mlir_apfloat_add' but with different type: '(i32, i32, f32) -> index' (expected '(i32, i64, i64) -> i64')}}
-func.func private @_mlir_apfloat_add(i32, i32, f32) -> index
-func.func @addf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
-  %0 = arith.addf %arg0, %arg1 : f4E2M1FN
-  return
-}
-
-// -----

-// CHECK: func.func private @_mlir_apfloat_subtract(i32, i64, i64) -> i64
-// CHECK: %[[sem:.*]] = arith.constant 18 : i32
-// CHECK: call @_mlir_apfloat_subtract(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
-func.func @subf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
-  %0 = arith.subf %arg0, %arg1 : f4E2M1FN
-  return
-}
-
-// -----
-
-// CHECK: func.func private @_mlir_apfloat_multiply(i32, i64, i64) -> i64
-// CHECK: %[[sem:.*]] = arith.constant 18 : i32
-// CHECK: call @_mlir_apfloat_multiply(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
-func.func @mulf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
-  %0 = arith.mulf %arg0, %arg1 : f4E2M1FN
-  return
-}
-
-// -----
-
-// CHECK: func.func private @_mlir_apfloat_divide(i32, i64, i64) -> i64
-// CHECK: %[[sem:.*]] = arith.constant 18 : i32
-// CHECK: call @_mlir_apfloat_divide(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
-func.func @divf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
-  %0 = arith.divf %arg0, %arg1 : f4E2M1FN
-  return
-}
-
-// -----
-
-// CHECK: func.func private @_mlir_apfloat_remainder(i32, i64, i64) -> i64
-// CHECK: %[[sem:.*]] = arith.constant 18 : i32
-// CHECK: call @_mlir_apfloat_remainder(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
-func.func @remf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
-  %0 = arith.remf %arg0, %arg1 : f4E2M1FN
-  return
-}
diff --git a/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir b/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir
deleted file mode 100644
index 2768afe0834b5..0000000000000
--- a/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir
+++ /dev/null
@@ -1,36 +0,0 @@
-// Case 1: All floating-point arithmetic is lowered through APFloat.
-// RUN: mlir-opt %s --convert-arith-to-apfloat --convert-to-llvm | \
-// RUN: mlir-runner -e entry --entry-point-result=void \
-// RUN: --shared-libs=%mlir_c_runner_utils \
-// RUN: --shared-libs=%mlir_apfloat_wrappers | FileCheck %s
-
-// Case 2: Only unsupported arithmetic (f8E4M3FN) is lowered through APFloat.
-// Arithmetic on f32 is lowered directly to LLVM.
-// RUN: mlir-opt %s --convert-to-llvm --convert-arith-to-apfloat \
-// RUN: --convert-to-llvm --reconcile-unrealized-casts | \
-// RUN: mlir-runner -e entry --entry-point-result=void \
-// RUN: --shared-libs=%mlir_c_runner_utils \
-// RUN: --shared-libs=%mlir_apfloat_wrappers | FileCheck %s
-
-// Put rhs into separate function so that it won't be constant-folded.
-func.func @foo() -> (f8E4M3FN, f32) {
-  %cst1 = arith.constant 2.2 : f8E4M3FN
-  %cst2 = arith.constant 2.2 : f32
-  return %cst1, %cst2 : f8E4M3FN, f32
-}
-
-func.func @entry() {
-  %a1 = arith.constant 1.4 : f8E4M3FN
-  %a2 = arith.constant 1.4 : f32
-  %b1, %b2 = func.call @foo() : () -> (f8E4M3FN, f32)
-  %c1 = arith.addf %a1, %b1 : f8E4M3FN // not supported by LLVM
-  %c2 = arith.addf %a2, %b2 : f32 // supported by LLVM
-
-  // CHECK: 3.5
-  vector.print %c1 : f8E4M3FN
-
-  // CHECK: 3.6
-  vector.print %c2 : f32
-
-  return
-}
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
index 4a38ed605be0c..6ff12d66523f5 100644
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -208,7 +208,6 @@ def find_real_python_interpreter():
        add_runtime("mlir_c_runner_utils"),
        add_runtime("mlir_async_runtime"),
        add_runtime("mlir_float16_utils"),
-        add_runtime("mlir_apfloat_wrappers"),
        "mlir-linalg-ods-yaml-gen",
        "mlir-reduce",
        "mlir-pdll",
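Reviewer note on the runtime ABI that the deleted APFloatWrappers.cpp defined: every wrapper took the APFloat semantics as an i32 enum value plus both operands as zero-extended 64-bit bit patterns, and returned the result's bit pattern. The following is a minimal standalone sketch of that convention, assuming only llvm/ADT/APFloat.h; the name mlir_apfloat_add_sketch is hypothetical (the deleted library stamped out _mlir_apfloat_add and friends via macros), everything else mirrors the removed code.

    // Sketch of the removed wrapper convention (hypothetical name).
    #include <cstdint>
    #include "llvm/ADT/APFloat.h"

    extern "C" int64_t mlir_apfloat_add_sketch(int32_t semantics, uint64_t a,
                                               uint64_t b) {
      // Recover the float semantics from the enum value passed by the MLIR side.
      const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics(
          static_cast<llvm::APFloatBase::Semantics>(semantics));
      // Reinterpret the low bits of each i64 operand at the type's real width.
      unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem);
      llvm::APFloat lhs(sem, llvm::APInt(bitWidth, a));
      llvm::APFloat rhs(sem, llvm::APInt(bitWidth, b));
      lhs.add(rhs, llvm::RoundingMode::NearestTiesToEven);
      // Hand the result back as a zero-extended bit pattern.
      return lhs.bitcastToAPInt().getZExtValue();
    }

The deleted lit test above shows the caller side of the same convention: for f8E4M3FN it passes the constant 10 as the semantics value and bitcast/extui'd i8 operands.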
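The deleted pass also used an idiom worth noting for reviewers: walkAndApplyPatterns returns no LogicalResult, so the pass captured emitted error diagnostics to decide whether to signal failure. A standalone sketch of just that idiom, under the assumption of an ordinary MLIR context (the helper name runAndCaptureErrors is illustrative, ScopedDiagnosticHandler is the real API):

    #include "mlir/IR/Diagnostics.h"
    #include "mlir/IR/MLIRContext.h"
    #include "llvm/ADT/STLFunctionalExtras.h"

    using namespace mlir;

    // Track whether any error diagnostic fires while `body` runs; returning
    // failure() from the handler means "not handled", so other handlers in
    // the chain still see the diagnostic.
    static LogicalResult runAndCaptureErrors(MLIRContext *ctx,
                                             llvm::function_ref<void()> body) {
      LogicalResult result = success();
      ScopedDiagnosticHandler handler(ctx, [&result](Diagnostic &diag) {
        if (diag.getSeverity() == DiagnosticSeverity::Error)
          result = failure();
        return failure(); // Propagate to any other registered handlers.
      });
      body();
      return result;
    }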
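On the VectorUtils change: callers of createReadOrMaskedRead now describe the read with a single VectorType (shape plus scalable dims together) instead of a shape array and a separate scalable-dims array. A hedged caller-side sketch, assuming the new overload exactly as declared in the hunk (the helper buildMaskedRead and its parameters are illustrative only):

    #include "mlir/Dialect/Vector/Utils/VectorUtils.h"
    #include "mlir/IR/BuiltinTypes.h"

    using namespace mlir;

    static Value buildMaskedRead(OpBuilder &b, Location loc, Value source,
                                 ArrayRef<int64_t> sizes,
                                 ArrayRef<bool> scalableDims, Value padValue) {
      auto elemTy = cast<ShapedType>(source.getType()).getElementType();
      // Shape and scalability travel together in one VectorType now.
      VectorType vecTy = VectorType::get(sizes, elemTy, scalableDims);
      // The utility emits the transfer_read and, when the vector shape does
      // not match the source shape, wraps it in a mask.
      return vector::createReadOrMaskedRead(b, loc, source, vecTy, padValue,
                                            /*useInBoundsInsteadOfMasking=*/false);
    }

This is why each Vectorization.cpp call site in the hunks above now builds a VectorType up front rather than threading a shape and scalable flags through separately.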